In [37]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
from glove import Corpus, Glove

# Fetch the NLTK data packages this notebook requests (a no-op when they are
# already up to date locally).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ananthan2k/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ananthan2k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ananthan2k/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocessing

In [38]:
def normalize(text):
    """Return ``text`` converted to lower case.

    Args:
        text: Input string.

    Returns:
        str: The lower-cased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

In [39]:
import re
def removenumbers(text):
    """Remove every run of digits from ``text``.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with all digit sequences deleted; surrounding
        characters (including whitespace) are left untouched.
    """
    # Raw string: keeps the regex escape intact and avoids the
    # invalid-escape-sequence warning raised for the non-raw form.
    return re.sub(r"\d+", "", text)

In [40]:
def makesentences(text):
    """Split ``text`` into sentences and blank out non-word characters.

    Newlines are replaced by spaces, the result is split into sentences with
    NLTK's ``sent_tokenize``, and within each sentence every character matched
    by the regex non-word class is replaced by a single space.

    Args:
        text: Input string.

    Returns:
        list[str]: One cleaned string per sentence returned by
        ``sent_tokenize``.
    """
    flattened = text.replace("\n", " ")
    # Raw string: keeps the regex escape intact and avoids the
    # invalid-escape-sequence warning raised for the non-raw form.
    return [re.sub(r"\W", " ", sentence) for sentence in sent_tokenize(flattened)]

In [41]:
def preprocess(text):
    """Run the full cleaning pipeline on a raw document.

    The steps are applied in this order: lower-casing (``normalize``), digit
    removal (``removenumbers``), then sentence splitting and non-word
    character blanking (``makesentences``).

    Args:
        text: Raw document string.

    Returns:
        The list of cleaned sentence strings produced by ``makesentences``.
    """
    lowered = normalize(text)
    without_digits = removenumbers(lowered)
    return makesentences(without_digits)

### Load the corpus

In [42]:
data = []
path = './Datas/text.txt'
# Decode explicitly instead of relying on the platform default encoding, so
# the notebook reads the corpus the same way on every machine.
# NOTE(review): assumes the corpus file is UTF-8 -- adjust if it is not.
with open(path, encoding='utf-8') as fd:
    lines = fd.read()
# Preprocess after the file has been closed; each entry of `data` is the list
# of cleaned sentences for one document.
doc = preprocess(lines)
data.append(doc)
    

In [43]:
# Flatten every document into one list holding, per sentence, its
# whitespace-separated tokens.
sent = []
for doc in data:
    sent.extend(sentence.split() for sentence in doc)

print(len(sent))

84257


In [26]:
# Corpus object that will hold the dictionary and co-occurrence matrix.
corpus = Corpus()

# Build the co-occurrence matrix from the tokenised sentences, using a
# context window of 10.
corpus.fit(sent, window=10)

In [27]:
# GloVe model whose word vectors have 25 components each.
glove = Glove(no_components=25)

In [28]:
import time

# The timed span covers training, attaching the dictionary and saving.
start = time.time()
# Turn the co-occurrence matrix into word embeddings (50 epochs, 4 threads).
glove.fit(corpus.matrix, epochs=50, no_threads=4)
# Attach the word -> index mapping so vectors can be looked up by word.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
end = time.time()

# Elapsed wall-clock time in seconds.
end - start

81.27971506118774

In [29]:
# `load` returns a new model rather than updating the object it is called on,
# so bind the result; otherwise the saved model is loaded and then discarded.
glove = Glove.load('glove.model')

<glove.glove.Glove at 0x7f6164350040>

In [30]:
# Number of entries in the corpus dictionary (presumably the vocabulary size).
len(corpus.dictionary)

91796

In [31]:
# Map 'duchy' to its index through the model dictionary and show that row of
# the word-vector matrix.
glove.word_vectors[glove.dictionary['duchy']]

array([ 0.1067566 ,  0.22632935, -0.24461168,  0.30800894,  0.05424697,
       -0.13670989,  0.25378217, -0.27005755,  0.23821969, -0.03171011,
        0.08643283, -0.22801842, -0.11694864,  0.05465114,  0.56119682,
        0.11275145,  0.25527233, -0.13403117, -0.6364425 , -0.13954891,
       -0.24334096, -0.3057332 , -0.12278836,  0.21072148,  0.22428856])

In [33]:
# Words closest to 'mind' in the GloVe space, shown as (word, score) pairs.
glove.most_similar('mind')

[('memory', 0.886923611545967),
 ('personal', 0.8857053518695853),
 ('marriage', 0.884614581137182),
 ('successor', 0.8834417831414622)]

In [34]:
## Word2Vec

In [36]:
pip install gensim==4.0.1

Collecting gensim==4.0.1
  Downloading gensim-4.0.1-cp38-cp38-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 10.5 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 11.6 MB/s eta 0:00:01
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.0.1 smart-open-5.2.1
Note: you may need to restart the kernel to use updated packages.


In [56]:
from gensim.models import Word2Vec, FastText

In [48]:
## Only once
import time

start = time.time()
# CBOW model (sg=0, per the gensim 4 API used here). In gensim 3 the
# equivalent call is Word2Vec(sent, size=50, window=5, sg=0).
cbow = Word2Vec(sent, vector_size=50, window=5)
end = time.time()

# Elapsed wall-clock training time in seconds.
end - start

6.089433431625366

In [50]:
# Persist the trained Word2Vec model to disk.
cbow.save("word2vec.model")

In [54]:
# Similarity score between the Word2Vec vectors of 'duchy' and 'prussia'.
print(cbow.wv.similarity('duchy', 'prussia'))

0.83680445


In [55]:
### FastText

In [59]:
# Train FastText on the tokenised sentences; the timed span covers both
# training and saving. Relies on `time` imported in an earlier cell.
start = time.time()
fasttextmodel = FastText(sentences=sent, window=3, min_count=1)
fasttextmodel.save('fastText.model')
end = time.time()

# Elapsed wall-clock time in seconds.
end - start


38.93484115600586

In [61]:
# Similarity score between the FastText vectors of 'duchy' and 'prussia'.
fasttextmodel.wv.similarity('duchy', 'prussia')

0.8282014