In [5]:
# Mount Google Drive inside the Colab runtime so the notebook can read and
# write files under /content/gdrive/.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [6]:
%cd /content/gdrive/MyDrive/NLP2021/WordEmbeddings/

/content/gdrive/MyDrive/NLP2021/WordEmbeddings


# GloVe
You may find an implementation of GloVe here:

https://pypi.org/project/glove-python-binary/ - must be used with Python 3.5 or above

https://github.com/maciejkula/glove-python


Ref:

https://medium.com/analytics-vidhya/word-vectorization-using-glove-76919685ee0b

In [7]:
pip install glove-python-binary



#There are two major tasks in GloVe

*   creating a co-occurrence matrix from the corpus, 
*   using it to produce the embeddings

#These are achieved by two classes:

1.   Corpus :  Given a corpus, it constructs vocabulary and co-occurrence matrix

2.   Glove : trains the embeddings



In [8]:
from glove import Corpus, Glove

#Pre-processing Functions

In [9]:
def normalize(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [10]:
import re
def removenumbers(text):
    """Remove every run of digits from ``text``.

    Args:
        text: the input string.

    Returns:
        A copy of ``text`` with all digit characters deleted. Surrounding
        whitespace is left untouched, so "a 12 b" becomes "a  b".
    """
    # Raw string: a backslash-d inside a plain literal is an invalid escape
    # sequence, which Python warns about (SyntaxWarning on 3.12+).
    return re.sub(r"\d+", "", text)

In [11]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

def makesentences(text):
    """Split ``text`` into sentences and blank out non-word characters.

    Newlines are first replaced by spaces, then the text is split into
    sentences with NLTK's ``sent_tokenize``. Within each sentence every
    non-word character (punctuation, whitespace, ...) is replaced by a
    single space.

    Args:
        text: the input string.

    Returns:
        A list of strings, one per detected sentence.
    """
    flattened = re.sub("\n", " ", text)
    # Raw string for the regex: a backslash-W inside a plain literal is an
    # invalid escape sequence, which Python warns about (SyntaxWarning on 3.12+).
    return [re.sub(r"\W", " ", sentence) for sentence in sent_tokenize(flattened)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
def preprocess(text):
    """Run the full cleaning pipeline on a raw document.

    The text is lowercased, stripped of digits, and then split into
    sentences, in that order.

    Returns:
        Whatever ``makesentences`` returns for the cleaned text.
    """
    return makesentences(removenumbers(normalize(text)))

#Load Corpus

In [13]:
# The Mahabharata corpus is taken from the following website:
# https://www.sacred-texts.com/hin/maha/
# Each file mb/maha<i>.txt is read, pre-processed, and appended to `data`,
# so `data` ends up holding one preprocess() result per file.
data = []
n = 19  # exclusive upper bound: the 18 texts are numbered 1..18
for i in range(1, n):
    path = "mb/maha" + str(i) + '.txt'
    # Context manager guarantees the file handle is closed after reading,
    # even if read() raises; the original left all 18 handles open.
    with open(path) as file:
        lines = file.read()
    doc = preprocess(lines)
    data.append(doc)

In [14]:
# Flatten the per-file sentence lists into a single list, splitting each
# sentence on whitespace so that every entry is a list of word tokens.
sent = [sentence.split() for book in data for sentence in book]

# Sanity check: total sentence count and one sample tokenized sentence.
print(len(sent))
print(sent[10000])

131541
['then', 'dhaumya', 'well', 'conversant', 'with', 'the', 'vedas', 'igniting', 'the', 'sacred', 'fire', 'poured', 'with', 'due', 'mantras', 'libations', 'of', 'clarified', 'butter', 'into', 'that', 'blazing', 'element']


#Create the GloVe Model

The corpus.fit() takes two arguments:

1.   lines — the pre-processed text, given as a list of tokenized sentences (each sentence is a list of words)
2.   window — size of the context window, in words

In [15]:
corpus = Corpus()  # Corpus object

# Build the co-occurrence matrix from the tokenized sentences in `sent`,
# counting co-occurrences with respect to a context window of 10.
corpus.fit(sent, window=10)

# corpus --> co-occurrence matrix (read later as corpus.matrix)


The Glove() constructor takes 

1.   no_components — size of the word vectors that are created
2.   learning_rate — the learning rate used while training the embeddings


In [16]:
# GloVe model object; it is trained separately by calling glove.fit().
glove = Glove(no_components=25)  # size of each word vector

The glove.fit() takes:

1.   cooccurrence_matrix: the matrix of word-word co-occurrences
2.   epochs: number of times the dataset is processed
3.   no_threads: number of threads for parallel processing







In [17]:
# Run only once: this cell trains the model and saves it to 'glove.model'.
import time
start = time.time()
glove.fit(corpus.matrix, epochs=50, no_threads=4)  # co-occurrence matrix --> word embeddings
# Attach the corpus's word dictionary to the model; later cells use
# glove.dictionary[word] as the row index into glove.word_vectors.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
end = time.time()
# Elapsed wall-clock time in seconds, shown as the cell output.
end-start

130.40521621704102

In [25]:
# Glove.load returns a new model instance instead of updating the object it
# is called on, so the result has to be bound to a name to be used at all.
# Without the assignment the loaded model was simply discarded.
glove = Glove.load('glove.model')

<glove.glove.Glove at 0x7f629f54ad10>

In [19]:
# Number of entries in the dictionary built by corpus.fit().
len(corpus.dictionary)

31681

In [20]:
# Look up the word's index in the dictionary, then fetch that row of the
# embedding matrix: the vector for 'krishna'.
glove.word_vectors[glove.dictionary['krishna']]

array([-3.50014510e-01,  5.41537420e-01, -4.24498644e-01,  6.48002732e-03,
       -4.43351357e-01,  4.16083526e-01,  3.74757858e-01, -4.71215809e-01,
       -3.00953196e-01, -3.61419186e-01,  4.15034334e-01,  2.41445370e-01,
       -3.69636749e-01, -3.48069620e-01, -2.69791700e-01, -2.43678201e-01,
        4.61206070e-04, -9.94493635e-02,  5.22362240e-01,  3.08452243e-01,
        3.43109478e-01, -1.89536383e-01, -9.25948038e-02,  2.86891579e-01,
        5.32442909e-01])

In [21]:
# Words the model ranks as most similar to 'bharata', returned as
# (word, score) pairs.
glove.most_similar('bharata')

[('race', 0.9458372835902824),
 ('bull', 0.9081990989694315),
 ('o', 0.9071518455445287),
 ('kuru', 0.89864954430852)]

In [22]:
# Plain dict mapping every vocabulary word to its embedding vector.
D = {
    token: glove.word_vectors[row]
    for token, row in glove.dictionary.items()
}

In [23]:
#D

In [24]:
# Embedding vector for 'arjuna', read from the word -> vector dict D.
D['arjuna']

array([-0.39440406,  0.17652099, -0.58055535,  0.08393565, -0.68733484,
        0.28464127,  0.41744381, -0.30537896, -0.08472741, -0.34112128,
        0.35107387,  0.25535321, -0.22785611, -0.39854562, -0.3099981 ,
       -0.39314134, -0.33393178, -0.36968778,  0.44259752,  0.19432483,
        0.2186686 , -0.52209415, -0.45917745,  0.52118736,  0.68552378])

Save the dictionary to a file and follow the same process as used for the pre-trained model.