# Word Embeddings

A word embedding is a class of approaches for representing words and documents using a
dense vector representation. It is an improvement over the more traditional bag-of-words model
encoding schemes where large sparse vectors were used to represent each word or to score each
word within a vector to represent an entire vocabulary. These representations were sparse
because the vocabularies were vast and a given word or document would be represented by a
large vector comprised mostly of zero values.

Instead, in an embedding, words are represented by dense vectors where a vector represents
the projection of the word into a continuous vector space. The position of a word within the
vector space is learned from text and is based on the words that surround the word when it is
used. The position of a word in the learned vector space is referred to as its embedding. Two
popular examples of methods of learning word embeddings from text include:
+ Word2Vec.
+ GloVe.

In addition to these carefully designed methods, a word embedding can be learned as part
of a deep learning model. This can be a slower approach, but tailors the model to a specific
training dataset.

In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.9/27.9 MB 12.0 MB/s eta 0:00:00
Installing collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
from gensim.models import Word2Vec
# define training data: a toy corpus in which every sentence is a list of word tokens
sentences = [
    line.split()
    for line in (
        'this is the first sentence for word2vec',
        'this is the second sentence',
        'yet another sentence',
        'one more sentence love',
        'and the final sentence solve',
    )
]

In [3]:
# train a CBOW model (sg=0) on the toy corpus; min_count=1 keeps every word
cbow_model = Word2Vec(
    sentences,
    vector_size=10,
    window=3,
    min_count=1,
    sg=0,
)

In [4]:
# summarize the trained model (prints vocab size, vector_size and alpha)
print(cbow_model)

Word2Vec<vocab=16, vector_size=10, alpha=0.025>


In [5]:
list(cbow_model.wv.key_to_index)   # iterating the word -> index mapping yields the vocab

['sentence',
 'the',
 'is',
 'this',
 'solve',
 'final',
 'and',
 'love',
 'more',
 'one',
 'another',
 'yet',
 'second',
 'word2vec',
 'for',
 'first']

In [6]:
# access vector for one word (10 values, matching vector_size=10 used at training)
cbow_model.wv.get_vector('love')

array([ 0.05455598,  0.08345091, -0.0145442 , -0.09208831,  0.04371774,
        0.00572208,  0.07440059, -0.00813585, -0.0263755 , -0.08752632],
      dtype=float32)

In [7]:
# print every vocabulary word next to its learned embedding
for key in cbow_model.wv.key_to_index:
    print(key, ':', cbow_model.wv.get_vector(key))

sentence : [-0.00536351  0.00238484  0.05107331  0.09015599 -0.09308276 -0.0711995
  0.06464671  0.08974326 -0.0501915  -0.03765175]
the : [ 0.07379383 -0.01529509 -0.04534442  0.06555367 -0.04861008 -0.0181848
  0.02882639  0.00993654 -0.08292154 -0.0944667 ]
is : [ 0.07311766  0.05070262  0.06757693  0.00762866  0.06350891 -0.03405366
 -0.00946401  0.05768573 -0.07521638 -0.03936104]
this : [-0.07512096 -0.00929068  0.09539422 -0.07316343 -0.02336625 -0.01939589
  0.08080077 -0.0592867   0.00042713 -0.04753667]
solve : [-0.09603605  0.05007694 -0.08758304 -0.04394896 -0.00034404 -0.00295622
 -0.07661133  0.09616364  0.04980589  0.09235525]
final : [-0.08158192  0.04498189 -0.04134833  0.00827747  0.08496136 -0.04464175
  0.04521902 -0.06785722 -0.03552099  0.09398862]
and : [-0.0157806   0.00323172 -0.04137019 -0.0768177  -0.01509309  0.02468751
 -0.00885536  0.05536246 -0.02745937  0.02261946]
love : [ 0.05455598  0.08345091 -0.0145442  -0.09208831  0.04371774  0.00572208
  0.074400

In [8]:
# 'analytics' never occurs in the training sentences, so Word2Vec has no vector for it.
# The KeyError is the point of this cell; it is caught and printed so that
# "Restart & Run All" does not stop here.
try:
    cbow_model.wv.get_vector('analytics')
except KeyError as err:
    print('KeyError:', err)

KeyError: "Key 'analytics' not present"

In [9]:
# save model to the current working directory
# NOTE(review): despite the '.bin' name this is presumably gensim's own save format, not the
# word2vec binary format read by load_word2vec_format — confirm before sharing the file.
cbow_model.save('model.bin')

In [10]:
# load model back from disk and print its summary to compare with the saved one
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec<vocab=16, vector_size=10, alpha=0.025>


In [11]:
# Verify the save/load round trip: query the *reloaded* model and compare the vector
# with the one cbow_model returned for the same word before saving.
new_model.wv.get_vector('love')

array([ 0.05455598,  0.08345091, -0.0145442 , -0.09208831,  0.04371774,
        0.00572208,  0.07440059, -0.00813585, -0.0263755 , -0.08752632],
      dtype=float32)

In [12]:
# train a skip-gram model (sg=1) with otherwise the same settings as the CBOW model
sg_model = Word2Vec(
    sentences,
    vector_size=10,
    window=3,
    min_count=1,
    sg=1,
)

# access vector for one word
sg_model.wv.get_vector('love')

array([ 0.05455508,  0.08348128, -0.01442463, -0.09193361,  0.04362334,
        0.00568476,  0.07447571, -0.00811199, -0.02645334, -0.0874837 ],
      dtype=float32)

In [13]:
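# CBOW vector for 'love' from the earlier cells, kept here to compare with the skip-gram vector above:
# [ 0.05455598,  0.08345091, -0.0145442 , -0.09208831,  0.04371774,
#         0.00572208,  0.07440059, -0.00813585, -0.0263755 , -0.08752632],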

### Some computations using Word Embeddings

In [None]:
from gensim.models import KeyedVectors
# Offline alternative: load the Google News word2vec vectors from a local copy of the binary file.
# NOTE(review): the directory below is an absolute path on one specific Windows machine;
# point it at your own copy of the file before running this cell.
path = r'D:\OneDrive\Google Drive Files\Training\1 MASTER\NLP Master\Word Embedding\WV -1'
filename = path + r'\GoogleNews-vectors-negative300.bin'
# (older machine-specific location, kept for reference)
# filename = r'C:\Users\dell\Google Drive\DUMP\Desktop\Nomura NLP\Word Embedding\WV -1\GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [14]:
import gensim.downloader as api
# Load the pretrained Google News vectors through gensim's downloader; binds them to `model`.
# NOTE(review): presumably a large one-off download that gensim caches locally — confirm before running.
model = api.load("word2vec-google-news-300")




In [15]:
# Inspect the pretrained vectors: object type, vocabulary size and embedding dimension
print(type(model))
print(len(model))          # vocabulary size
print(model.vector_size)   # embedding dimension


<class 'gensim.models.keyedvectors.KeyedVectors'>
3000000
300


In [16]:
# Index the model by a word to get its embedding
vector = model["king"]
print(vector[:10])  # print first 10 values


[ 0.12597656  0.02978516  0.00860596  0.13964844 -0.02563477 -0.03613281
  0.11181641 -0.19824219  0.05126953  0.36328125]


In [17]:
# The ten words closest to "king", returned as (word, similarity score) pairs
similar_words = model.most_similar("king", topn=10)

for word, score in similar_words:
    print(word, score)


kings 0.7138045430183411
queen 0.6510956883430481
monarch 0.6413194537162781
crown_prince 0.6204220056533813
prince 0.6159993410110474
sultan 0.5864824056625366
ruler 0.5797567367553711
princes 0.5646552443504333
Prince_Paras 0.5432944297790527
throne 0.5422105193138123


In [18]:
# Similarity between two specific words; matches the "queen" score in the most_similar output
similarity = model.similarity("king", "queen")
print("Similarity:", similarity)


Similarity: 0.6510957


In [19]:
# Word analogy: king - man + woman -> ?
# `positive` words are added to the query, `negative` words are subtracted from it.
result = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=5
)

print(result)


[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]


In [20]:
word = "chatgpt"

# `in` on the model tells us whether the word has a vector
print("Word exists" if word in model else "Word not in vocabulary")


Word not in vocabulary


In [22]:
def safe_similarity(w1, w2):
    """Return the similarity of ``w1`` and ``w2`` according to the global ``model``.

    If either word is not in the model's vocabulary, the string
    "One of the words not in vocabulary" is returned instead of a score.
    """
    # Guard clause: bail out before the lookup when a word is unknown.
    if not (w1 in model and w2 in model):
        return "One of the words not in vocabulary"
    return model.similarity(w1, w2)

print(safe_similarity("king", "queen"))


0.6510957


In [None]:
# full embedding of 'king' from the pretrained model
model.get_vector('king')

In [None]:
# number of components in a single word vector
len(model.get_vector('king'))

In [None]:
# full embedding of 'queen' from the pretrained model
model.get_vector('queen')

In [None]:
# What is the woman equivalent of King ????
# Build the analogy vector by hand (king + woman - man) and measure its cosine similarity to 'queen'.
# NOTE(review): the score differs slightly from most_similar's 'queen' score for the same analogy,
# presumably because most_similar normalises the word vectors before combining them — confirm.
a = model.get_vector('king') + model.get_vector('woman') - model.get_vector('man')
model.cosine_similarities(a, [model.get_vector('queen')])

array([0.7300517], dtype=float32)

In [None]:
# calculate: (king - man) + woman = ?  (Queen)
# same analogy as the manual computation, this time via most_similar (top 3 candidates)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3)
print(result)

[('queen', 0.7118193507194519), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951)]


In [None]:
# Who is the God of Cricket in India ?
# Only positive words are combined here; nothing is subtracted (negative is empty).
result = model.most_similar(positive=['God', 'cricket', 'India'], negative=[], topn=3)
print(result)

[('cricketing', 0.657913863658905), ('Sachin', 0.6445838809013367), ('cricketers', 0.6345413327217102)]


In [None]:
# what is the female equivalent of the word "man"
# analogy: man - male + female -> ?
result = model.most_similar(positive=['man', 'female'], negative=['male'], topn=3)
print(result)

[('woman', 0.7685461640357971), ('teenage_girl', 0.5872832536697388), ('lady', 0.5742953419685364)]


In [None]:
# Checking how similarity works: two fruits should score fairly high.
print(model.similarity('strawberry', 'mango'))

0.63029367


In [None]:
# two closely related words
print(model.similarity('novel', 'book'))

0.6121936


In [None]:
# two unrelated words, for contrast with the pairs above
print(model.similarity('novel', 'mango'))

0.06800091


In [None]:
# Finding the odd one out.
# doesnt_match receives a list of words and returns the one that fits the others least.
model.doesnt_match('breakfast cereal dinner lunch'.split())
# another example to try:
# model.doesnt_match('mango apple banana rose'.split())

'cereal'

In [None]:
# Combine two concepts: words closest to 'dog' + 'newborn'
result = model.most_similar(positive=['dog', 'newborn'], topn=3)
print(result)

[('puppy', 0.793572187423706), ('pup', 0.7656400799751282), ('kitten', 0.730669379234314)]


### Further Reading:

#### Word Embeddings
+ Word Embedding on Wikipedia. https://en.wikipedia.org/wiki/Word_embedding
+ Word2Vec on Wikipedia. https://en.wikipedia.org/wiki/Word2vec
+ Google Word2Vec project. https://code.google.com/archive/p/word2vec/
+ Stanford GloVe project. https://nlp.stanford.edu/projects/glove/

### Articles
+ Messing Around With Word2Vec, 2016. https://quomodocumque.wordpress.com/2016/01/15/messing-around-with-word2vec/
+ Vector Space Models for the Digital Humanities, 2015. http://bookworm.benschmidt.org/posts/2015-10-25-Word-Embeddings.html
+ Gensim Word2Vec Tutorial, 2014. https://rare-technologies.com/word2vec-tutorial/

# Facebook's FastText

`fastText` is an improved version of `word2vec`. `word2vec` builds its representation from whole words only, whereas `fastText` also uses the character n-grams inside each word, so it can produce a vector even for a word it never saw during training.

In [None]:
# Toy corpus for fastText: one token list per sentence (rebinds `sentences`)
sentences = [
    ['I', 'love', 'nlp'],
    ['I', 'will', 'learn', 'nlp', 'in', '2', 'months'],
    ['nlp', 'is', 'future'],
    ['nlp', 'saves', 'time', 'and', 'solves', 'lot', 'of', 'industry', 'problems'],
    ['nlp', 'uses', 'machine', 'learning'],
]

In [None]:
from gensim.models import FastText
# Train fastText on the toy corpus; min_n / max_n bound the character n-gram lengths
fast = FastText(
    sentences,
    vector_size=20,
    window=1,
    min_count=1,
    workers=5,
    min_n=1,
    max_n=2,
)

In [None]:
# vector of a word that is in the training vocabulary
fast.wv.get_vector('future')

array([ 0.00718044,  0.00634451,  0.01015092,  0.00278108,  0.00071975,
        0.01481973, -0.01144717,  0.0085934 ,  0.00387139, -0.00861204,
       -0.01795045, -0.00222266,  0.0043997 ,  0.01099374,  0.00549521,
       -0.02154304,  0.02005067, -0.00923354,  0.00634542,  0.00346849],
      dtype=float32)

In [None]:
# 'vidhya' is not among the training words (see key_to_index below), yet fastText still
# returns a vector for it instead of raising KeyError as Word2Vec did for 'analytics'.
fast.wv.get_vector('vidhya')

array([-0.00516996,  0.00863833,  0.00496077,  0.0065316 ,  0.01097381,
        0.00640103,  0.0061447 ,  0.00292935, -0.00900661,  0.01208093,
        0.00329964,  0.00117949, -0.00334365,  0.00119693,  0.0070916 ,
       -0.00667899,  0.00937506, -0.0102112 ,  0.00654462, -0.00943527],
      dtype=float32)

In [None]:
# word -> index mapping, i.e. the vocabulary seen during training
fast.wv.key_to_index

{'nlp': 0,
 'I': 1,
 'future': 2,
 'love': 3,
 'will': 4,
 'learn': 5,
 'in': 6,
 '2': 7,
 'months': 8,
 'is': 9,
 'learning': 10,
 'machine': 11,
 'time': 12,
 'and': 13,
 'solves': 14,
 'lot': 15,
 'of': 16,
 'industry': 17,
 'problems': 18,
 'uses': 19,
 'saves': 20}

In [None]:
# vocabulary size
len(fast.wv.key_to_index)

21

In [23]:
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", 200)

In [24]:
# Load the pre-cleaned tweets.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab workspace); adjust it when running elsewhere.
data = pd.read_csv("/content/tweets_cleaned.csv")
data.head()

Unnamed: 0,id,label,tweet,cleaned_tweets_w/o_SW,cleaned_tweets_with_SW
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias,finally transparant silicon case thanks uncle yay sony xperia sonyexperias
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect,love talk makememories unplug relax iphone smartphone wifi connect
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i am wired i know i am george i wa made that way iphone cute daventry home,wired know george way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple will not even talk to me about a question i have unless i pay them for their stupid support,amazing service apple talk question unless pay stupid support


In [25]:
# Tokenise every cleaned tweet on whitespace: a list of tweets, each a list of tokens.
# NOTE(review): the sample rows of 'cleaned_tweets_w/o_SW' still contain stopwords ("a", "to", "my"),
# so the two cleaned column names may be swapped in the CSV — confirm.
tweets_list = [tweet.split() for tweet in data['cleaned_tweets_w/o_SW']]
tweets_list[0]  # tokens of the first tweet

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [26]:
# Creating your own Word2Vec Model & Train
from gensim.models import Word2Vec
# train a CBOW model (sg=0) on the tweet tokens with 300-dimensional vectors
# NOTE: this rebinds cbow_model, replacing the toy model trained earlier in the notebook
cbow_model = Word2Vec(tweets_list, vector_size = 300, window = 3, min_count=5, sg=0)

In [27]:
# summarize the trained model (prints vocab size, vector_size and alpha)
print(cbow_model)

Word2Vec<vocab=2421, vector_size=300, alpha=0.025>


In [28]:
cbow_model.wv.index_to_key[:20]  # first 20 entries of the vocabulary

['iphone',
 'apple',
 'i',
 'my',
 'the',
 'to',
 'a',
 'is',
 'samsung',
 'it',
 'and',
 'you',
 'new',
 'twitter',
 'for',
 'com',
 'phone',
 'me',
 'sony',
 'not']

In [29]:
# vocabulary size of the tweet model
len(cbow_model.wv.index_to_key)

2421

In [30]:
def document_vector(doc, keyed_vectors=None):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    doc : str
        Document text; it is split on whitespace into tokens.
    keyed_vectors : optional
        Object exposing ``key_to_index``, ``get_vector`` and ``vector_size``
        (for example a gensim ``KeyedVectors``). Defaults to ``cbow_model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``. Out-of-vocabulary tokens are
        ignored. When no token is in the vocabulary the array is all NaN, so
        the row can be dropped later with ``dropna``.
    """
    if keyed_vectors is None:
        keyed_vectors = cbow_model.wv

    # key_to_index is a dict, so each membership test is a hash lookup rather
    # than a scan of the index_to_key list for every token.
    vocab = keyed_vectors.key_to_index
    known_vectors = [
        keyed_vectors.get_vector(word) for word in doc.split() if word in vocab
    ]

    if not known_vectors:
        # Averaging an empty list would warn and give a scalar NaN; return an
        # explicit NaN vector so every document has the same shape.
        return np.full(keyed_vectors.vector_size, np.nan, dtype=np.float32)

    return np.mean(known_vectors, axis=0)

# np.mean(model[doc], axis=0)

In [31]:
# turn every cleaned tweet into one document vector (mean of its word vectors)
tweets_temp = data['cleaned_tweets_w/o_SW'].apply(document_vector)

In [32]:
tweets_temp[:5]  # displaying the first 5 tweets as document vectors

Unnamed: 0,cleaned_tweets_w/o_SW
0,"[0.054268885, 0.40473282, -0.09620376, 0.09134357, 0.024051206, -0.37361565, 0.15609212, 0.44721785, -0.10511675, 0.070423044, -2.9611825e-05, -0.1428007, -0.0506938, 0.0485388, -0.16028607, -0.12..."
1,"[-0.014783407, 0.22535828, 0.01853086, 0.121349655, 0.020087698, -0.2346991, 0.25229293, 0.4267326, -0.056078713, -0.28192392, -0.05650697, -0.28570616, -0.042215772, 0.0671259, -0.22421312, -0.19..."
2,"[-0.037618868, 0.17099226, 0.10332838, 0.1895264, 0.0042260797, -0.14594029, 0.18567343, 0.44021693, 0.07365111, -0.15095136, 0.021655105, -0.23908228, -0.04965281, 0.05295978, -0.1884441, -0.0500..."
3,"[-0.018339146, 0.15211621, 0.16956869, 0.27515882, 0.0036363143, -0.117027506, 0.20179762, 0.4933991, 0.1370591, -0.20040296, 0.06633151, -0.3022457, -0.0562717, 0.08221877, -0.21520421, -0.018884..."
4,"[-0.046502292, 0.124211095, 0.14577249, 0.2412501, 0.0025359055, -0.095921405, 0.21098067, 0.47332457, 0.12218282, -0.24284135, 0.030270487, -0.298291, -0.063341506, 0.061859943, -0.22332464, -0.0..."


In [33]:
tweets_temp[0].shape  # each document vector is 300-dimensional

(300,)

In [34]:
# container type returned by .apply above (one entry per tweet)
type(tweets_temp)

In [35]:
# Combining all the document vectors into a single numpy array (tweets_vec)
embedding_size = 300
# Start from an all-NaN matrix and fill it one tweet per row.
tweets_vec = np.full((len(tweets_temp), embedding_size), np.nan)
for i, doc_vec in enumerate(tweets_temp):
    tweets_vec[i, :] = doc_vec

tweets_vec.shape  # this is the final feature matrix

(7920, 300)

In [36]:
# Create a new DataFrame holding the document features plus the label column 'y',
# then drop every row that contains a missing value.
df = (
    pd.DataFrame(tweets_vec)
    .assign(y=data['label'])
    .dropna(how='any', axis=0)
)

In [37]:
# first rows: 300 embedding features followed by the label column 'y'
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.054269,0.404733,-0.096204,0.091344,0.024051,-0.373616,0.156092,0.447218,-0.105117,0.070423,...,0.210348,0.133615,0.113033,0.15209,0.284028,-0.012608,-0.196937,0.207236,-0.103524,0
1,-0.014783,0.225358,0.018531,0.12135,0.020088,-0.234699,0.252293,0.426733,-0.056079,-0.281924,...,0.239038,0.112102,-0.126142,0.227138,0.226035,-0.027128,-0.177663,0.160725,-0.079999,0
2,-0.037619,0.170992,0.103328,0.189526,0.004226,-0.14594,0.185673,0.440217,0.073651,-0.150951,...,0.194427,0.137918,-0.029756,0.232818,0.298621,-0.026225,-0.108309,0.091437,-0.050207,0
3,-0.018339,0.152116,0.169569,0.275159,0.003636,-0.117028,0.201798,0.493399,0.137059,-0.200403,...,0.212774,0.180822,-0.050769,0.298632,0.328846,-0.041595,-0.089531,0.074694,-0.039046,0
4,-0.046502,0.124211,0.145772,0.24125,0.002536,-0.095921,0.210981,0.473325,0.122183,-0.242841,...,0.210272,0.159223,-0.080392,0.287627,0.332458,-0.025628,-0.097817,0.070081,-0.040654,1


In [38]:
# rows remaining after dropna, and 300 features + 1 label column
df.shape

(7920, 301)

In [39]:
# Separate the label vector from the embedding feature matrix
y = df['y']
X_word_emb = df.drop(columns='y')
X_word_emb.shape

(7920, 300)

In [40]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [41]:

# 5-fold stratified cross-validation of a scaled, L1-regularised logistic regression
# on the averaged word-embedding features.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    penalty='l1',
    C=0.4,
    random_state=42,
)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)])

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# mean and standard deviation of the accuracy (in %), train folds first, then test folds
for split in ('train_score', 'test_score'):
    scores = results[split]
    print(np.round(scores.mean() * 100, 2), np.round(scores.std() * 100, 2))


85.68 0.14
85.3 0.6


In [42]:
# Bag-of-words baseline on the same tweets, for comparison with the embedding features.
X = data['cleaned_tweets_w/o_SW']
y = data['label']

# Keep only words that occur in at least 5 documents (min_df=5) and, of those,
# only the top 300 (max_features=300), so the feature count matches the
# 300-dimensional embedding model.
CV = CountVectorizer(min_df=5, max_features=300)

# random_state is fixed, as in the embedding pipeline, so the scores are reproducible.
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
CV_pipe = Pipeline([('CV', CV), ('LR', LR1)])
results = cross_validate(CV_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)

# mean and standard deviation of the train accuracy (in %)
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2))

# mean and standard deviation of the test accuracy (in %)
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2))

# cross_validate works on clones of the pipeline, so CV itself is still unfitted here;
# fit it on the full data only to inspect the resulting vocabulary.
CV.fit(X)
len(CV.vocabulary_)  # number of bag-of-words features kept (capped by max_features)

88.96 0.06
87.77 0.99


300

In [None]:
pip install sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this rebinds `model`, which earlier cells use for the word2vec vectors;
# re-running those cells (e.g. safe_similarity) after this one will not work as before.
model = SentenceTransformer("all-MiniLM-L6-v2")


def compare_texts(text1, text2):
    """Return the cosine similarity between the sentence embeddings of two texts."""
    first_embedding = model.encode([text1])
    second_embedding = model.encode([text2])
    # Each input holds a single row, so the similarity matrix has one entry.
    return cosine_similarity(first_embedding, second_embedding)[0][0]


score = compare_texts(
    "machine learning is powerful",
    "AI and deep learning are powerful",
)

print("Similarity Score:", score)
