# Word2Vec implementation

# Word Embeddings 

### To do:
1. Sentences
2. One hot representation
3. onehot represenation - emdedding layer keras to form matrix
4. embedding matrix, dims = 10, voc_size=100

In [39]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot

In [40]:
sent=['the glass of milk',
     'the glass of juice',
     'the cup of tea',
     'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [41]:
voc_size = 100

## one hot representation

In [42]:
onehot_rep = [one_hot(word,voc_size) for word in sent]
print(onehot_rep)

[[24, 49, 96, 59], [24, 49, 96, 82], [24, 21, 96, 32], [67, 46, 83, 38, 29], [67, 46, 83, 38, 36], [9, 24, 79, 96, 15], [98, 75, 6, 38]]


In [43]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [44]:
import numpy as np

In [45]:
sent_length = 6
embedded_docs = pad_sequences(onehot_rep, padding='pre', maxlen=sent_length)

In [46]:
print(embedded_docs)

[[ 0  0 24 49 96 59]
 [ 0  0 24 49 96 82]
 [ 0  0 24 21 96 32]
 [ 0 67 46 83 38 29]
 [ 0 67 46 83 38 36]
 [ 0  9 24 79 96 15]
 [ 0  0 98 75  6 38]]


In [47]:
dim = 10
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile('adam', 'mse') # mse: mean square error

In [48]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 6, 10)             1000      
                                                                 
Total params: 1,000
Trainable params: 1,000
Non-trainable params: 0
_________________________________________________________________


In [49]:
print(model.predict(embedded_docs))

[[[-0.0188207   0.03263413  0.01849456 -0.04932909  0.02874951
   -0.01093489 -0.02073695  0.04921693  0.04581043  0.00562332]
  [-0.0188207   0.03263413  0.01849456 -0.04932909  0.02874951
   -0.01093489 -0.02073695  0.04921693  0.04581043  0.00562332]
  [-0.01629499  0.02391062 -0.0334879  -0.00844414  0.02863825
   -0.00410058 -0.00731366  0.00630441 -0.04588494  0.02532015]
  [-0.03839983 -0.00069003  0.01564706 -0.01971688 -0.03147626
    0.00846225 -0.01350915 -0.01708933  0.04577149 -0.00103629]
  [-0.04710863  0.03783317  0.01165559  0.03996098  0.01339909
    0.002726   -0.04589028  0.04533957 -0.00614625 -0.04603988]
  [-0.00594475  0.01394012  0.02325122  0.01625795  0.01343659
   -0.03511859 -0.00090399  0.00043748  0.01110033  0.02943536]]

 [[-0.0188207   0.03263413  0.01849456 -0.04932909  0.02874951
   -0.01093489 -0.02073695  0.04921693  0.04581043  0.00562332]
  [-0.0188207   0.03263413  0.01849456 -0.04932909  0.02874951
   -0.01093489 -0.02073695  0.04921693  0.0458

2023-04-07 12:15:16.089778: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [54]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

# define documents
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work', 
        'Could have done better.']

# define class labels
labels = array([1,1,1,1,1,0,0,0,0,0])

# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
print(model.summary())

# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

[[46, 34], [20, 30], [37, 5], [19, 30], [29], [42], [45, 5], [49, 20], [45, 30], [27, 43, 34, 39]]
[[46 34  0  0]
 [20 30  0  0]
 [37  5  0  0]
 [19 30  0  0]
 [29  0  0  0]
 [42  0  0  0]
 [45  5  0  0]
 [49 20  0  0]
 [45 30  0  0]
 [27 43 34 39]]
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 4, 8)              400       
                                                                 
 flatten_2 (Flatten)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


2023-04-07 12:42:59.833431: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy: 90.000004


2023-04-07 12:43:00.423985: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


## Word2vec using pretrained gensim

In [55]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.1-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: gensim
Successfully installed gensim-4.3.1


In [56]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [57]:
import gensim.downloader as api

In [58]:
wv = api.load('word2vec-google-news-300')
vec_king = wv['king']



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [59]:
vec_king.shape

(300,)

In [60]:
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

## word2vec from scratch

In [72]:
import pandas as pd
messages = pd.read_csv('spam.csv', encoding="windows-1250")

In [75]:
messages

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ě_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [77]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [79]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ammaarkhan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [81]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['v2'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [82]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [83]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [84]:
words =[]
for sent in corpus:
    sent_token = sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [85]:
words

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'go',
  'to',
  'usf',
  'he',
  'life',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'spea

In [86]:
import gensim

In [91]:
# training word2vec from scratch
model=gensim.models.Word2Vec(words)

In [92]:
# get all the vocabulary size
model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'it',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'be',
 'if',
 'will',
 'ur',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'time',
 'good',
 'then',
 'am',
 'got',
 'wa',
 'there',
 'he',
 'text',
 'only',
 'love',
 'want',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'going',
 'by',
 'home',
 'don',
 'about',
 'stop',
 'she',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'our',
 'think',
 'dont',
 'tell',
 'week',
 'phone',
 'hi',
 'new',
 'later',
 'they',
 'any',
 'her',
 'pls',
 'ha',
 'please',
 'co',
 'did',
 'msg',
 'been',
 'min',
 'an',
 'some',
 'dear',
 'make',
 'here',
 'night',
 'message',
 'who',
 'say',
 'well',
 're',
 'where',
 'thing',
 'much',
 'grea

In [89]:
model.corpus_count

5569

In [90]:
model.epochs

5

In [94]:
model.wv.similar_by_word('good')

[('day', 0.9990658164024353),
 ('hope', 0.998992919921875),
 ('great', 0.9989002346992493),
 ('my', 0.9987500905990601),
 ('and', 0.9987401366233826),
 ('happy', 0.9986902475357056),
 ('did', 0.9986728429794312),
 ('love', 0.9986676573753357),
 ('well', 0.9986157417297363),
 ('thing', 0.998526394367218)]

In [95]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [96]:
from tqdm import tqdm

In [97]:
x=[]
for i in tqdm(range(len(words))):
    print("hello",i)
    x.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 26%|█████████▍                           | 1429/5569 [00:00<00:00, 7322.26it/s]

hello 0
hello 1
hello 2
hello 3
hello 4
hello 5
hello 6
hello 7
hello 8
hello 9
hello 10
hello 11
hello 12
hello 13
hello 14
hello 15
hello 16
hello 17
hello 18
hello 19
hello 20
hello 21
hello 22
hello 23
hello 24
hello 25
hello 26
hello 27
hello 28
hello 29
hello 30
hello 31
hello 32
hello 33
hello 34
hello 35
hello 36
hello 37
hello 38
hello 39
hello 40
hello 41
hello 42
hello 43
hello 44
hello 45
hello 46
hello 47
hello 48
hello 49
hello 50
hello 51
hello 52
hello 53
hello 54
hello 55
hello 56
hello 57
hello 58
hello 59
hello 60
hello 61
hello 62
hello 63
hello 64
hello 65
hello 66
hello 67
hello 68
hello 69
hello 70
hello 71
hello 72
hello 73
hello 74
hello 75
hello 76
hello 77
hello 78
hello 79
hello 80
hello 81
hello 82
hello 83
hello 84
hello 85
hello 86
hello 87
hello 88
hello 89
hello 90
hello 91
hello 92
hello 93
hello 94
hello 95
hello 96
hello 97
hello 98
hello 99
hello 100
hello 101
hello 102
hello 103
hello 104
hello 105
hello 106
hello 107
hello 108
hello 109
hello 110


 55%|████████████████████▍                | 3068/5569 [00:00<00:00, 7902.12it/s]

hello 1491
hello 1492
hello 1493
hello 1494
hello 1495
hello 1496
hello 1497
hello 1498
hello 1499
hello 1500
hello 1501
hello 1502
hello 1503
hello 1504
hello 1505
hello 1506
hello 1507
hello 1508
hello 1509
hello 1510
hello 1511
hello 1512
hello 1513
hello 1514
hello 1515
hello 1516
hello 1517
hello 1518
hello 1519
hello 1520
hello 1521
hello 1522
hello 1523
hello 1524
hello 1525
hello 1526
hello 1527
hello 1528
hello 1529
hello 1530
hello 1531
hello 1532
hello 1533
hello 1534
hello 1535
hello 1536
hello 1537
hello 1538
hello 1539
hello 1540
hello 1541
hello 1542
hello 1543
hello 1544
hello 1545
hello 1546
hello 1547
hello 1548
hello 1549
hello 1550
hello 1551
hello 1552
hello 1553
hello 1554
hello 1555
hello 1556
hello 1557
hello 1558
hello 1559
hello 1560
hello 1561
hello 1562
hello 1563
hello 1564
hello 1565
hello 1566
hello 1567
hello 1568
hello 1569
hello 1570
hello 1571
hello 1572
hello 1573
hello 1574
hello 1575
hello 1576
hello 1577
hello 1578
hello 1579
hello 1580
hello 1581

 85%|███████████████████████████████▎     | 4722/5569 [00:00<00:00, 8067.90it/s]

hello 3127
hello 3128
hello 3129
hello 3130
hello 3131
hello 3132
hello 3133
hello 3134
hello 3135
hello 3136
hello 3137
hello 3138
hello 3139
hello 3140
hello 3141
hello 3142
hello 3143
hello 3144
hello 3145
hello 3146
hello 3147
hello 3148
hello 3149
hello 3150
hello 3151
hello 3152
hello 3153
hello 3154
hello 3155
hello 3156
hello 3157
hello 3158
hello 3159
hello 3160
hello 3161
hello 3162
hello 3163
hello 3164
hello 3165
hello 3166
hello 3167
hello 3168
hello 3169
hello 3170
hello 3171
hello 3172
hello 3173
hello 3174
hello 3175
hello 3176
hello 3177
hello 3178
hello 3179
hello 3180
hello 3181
hello 3182
hello 3183
hello 3184
hello 3185
hello 3186
hello 3187
hello 3188
hello 3189
hello 3190
hello 3191
hello 3192
hello 3193
hello 3194
hello 3195
hello 3196
hello 3197
hello 3198
hello 3199
hello 3200
hello 3201
hello 3202
hello 3203
hello 3204
hello 3205
hello 3206
hello 3207
hello 3208
hello 3209
hello 3210
hello 3211
hello 3212
hello 3213
hello 3214
hello 3215
hello 3216
hello 3217

100%|█████████████████████████████████████| 5569/5569 [00:00<00:00, 7890.63it/s]

hello 4781
hello 4782
hello 4783
hello 4784
hello 4785
hello 4786
hello 4787
hello 4788
hello 4789
hello 4790
hello 4791
hello 4792
hello 4793
hello 4794
hello 4795
hello 4796
hello 4797
hello 4798
hello 4799
hello 4800
hello 4801
hello 4802
hello 4803
hello 4804
hello 4805
hello 4806
hello 4807
hello 4808
hello 4809
hello 4810
hello 4811
hello 4812
hello 4813
hello 4814
hello 4815
hello 4816
hello 4817
hello 4818
hello 4819
hello 4820
hello 4821
hello 4822
hello 4823
hello 4824
hello 4825
hello 4826
hello 4827
hello 4828
hello 4829
hello 4830
hello 4831
hello 4832
hello 4833
hello 4834
hello 4835
hello 4836
hello 4837
hello 4838
hello 4839
hello 4840
hello 4841
hello 4842
hello 4843
hello 4844
hello 4845
hello 4846
hello 4847
hello 4848
hello 4849
hello 4850
hello 4851
hello 4852
hello 4853
hello 4854
hello 4855
hello 4856
hello 4857
hello 4858
hello 4859
hello 4860
hello 4861
hello 4862
hello 4863
hello 4864
hello 4865
hello 4866
hello 4867
hello 4868
hello 4869
hello 4870
hello 4871




In [98]:
x

[array([-0.19119076,  0.22133897,  0.2271226 ,  0.17504238,  0.11460008,
        -0.43264902,  0.19432136,  0.63080853, -0.29876307, -0.21775757,
        -0.12147628, -0.4777606 , -0.07275569,  0.14642052,  0.13269964,
        -0.16655244,  0.05468165, -0.299429  , -0.0166351 , -0.50166774,
         0.16027299,  0.25352162,  0.18260291, -0.19501893, -0.05320387,
         0.07082383, -0.18657853, -0.21068238, -0.31254897,  0.1171634 ,
         0.2532131 , -0.08865194,  0.11054234, -0.12506737, -0.16561565,
         0.2684643 ,  0.09660012, -0.23142117, -0.0479495 , -0.43623394,
         0.04285815, -0.1033441 , -0.1649774 , -0.00697073,  0.22797492,
        -0.09198744, -0.16523744, -0.0835187 ,  0.08093892,  0.11999945,
         0.02075102, -0.17376861, -0.11822831,  0.00298972, -0.05121901,
         0.06925825,  0.19712849,  0.04941469, -0.33194727,  0.17786238,
        -0.02730161, -0.00974139,  0.0983442 , -0.07071862, -0.3143548 ,
         0.26750803,  0.10726368,  0.16302696, -0.2

In [99]:
len(x)

5569

In [101]:
# independant features
x_new = np.array(x)

  x_new = np.array(x)
