In [1]:
import numpy as np
import re
import pandas as pd

In [2]:
def get_vocab(path):
    """Read a vocabulary file that stores one entry per line.

    Parameters
    ----------
    path : str or path-like
        Location of a UTF-8 encoded text file.

    Returns
    -------
    list of str
        The lines of the file in order, with newline characters removed.
        Blank lines are kept as empty strings.

    Side effects
    ------------
    Prints the number of entries that were read.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path, "r", encoding="utf-8") as vocab_file:
        # Only the newline is removed; other surrounding whitespace is preserved.
        vocab = [line.replace("\n", "") for line in vocab_file]
    print(len(vocab))
    return vocab

In [12]:
def get_embeddings(path):
    """Load a whitespace-separated text embedding file into a dictionary.

    Each non-blank line is split on whitespace; the first token becomes the
    key and the remaining tokens are converted to a float32 NumPy array.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps the first token of each line to a 1-D ``float32`` array built
        from the remaining tokens. If a key occurs more than once, the last
        occurrence wins.

    Raises
    ------
    ValueError
        If a token after the first one cannot be parsed as a float.

    Notes
    -----
    NOTE(review): a leading "<count> <dim>" header line, if the file has one,
    is not special-cased; it is stored like any other row (key = first token).
    Confirm whether that is intended for the files loaded here.
    """
    embedding_dict = {}
    # Explicit UTF-8 keeps decoding independent of the platform default, and
    # the context manager closes the handle even if a row fails to parse.
    with open(path, "r", encoding="utf-8") as vec_file:
        for row in vec_file:
            fields = row.split()
            if not fields:
                # Blank line: there is no word to index.
                continue
            embedding_dict[fields[0]] = np.array(fields[1:], dtype='float32')
    return embedding_dict

In [27]:
def write_to_file(path, embedding_dict, vocab):
    """Write the vectors of the given words to a text embedding file.

    One line is written per word in ``vocab``, in iteration order: the word
    followed by its vector components, all separated by single spaces.

    Parameters
    ----------
    path : str or path-like
        Output file; it is created or truncated and written as UTF-8.
    embedding_dict : mapping
        Maps each word to an iterable of vector components.
    vocab : iterable of str
        Words to write. Every word must be present in ``embedding_dict``.

    Raises
    ------
    KeyError
        If a word in ``vocab`` has no entry in ``embedding_dict``. Lines for
        the preceding words have already been written at that point.
    """
    # Explicit UTF-8 keeps the output independent of the platform default;
    # the context manager closes the file even if a lookup fails.
    with open(path, "w", encoding="utf-8") as out_file:
        for word in vocab:
            vector = embedding_dict.get(word)
            if vector is None:
                raise KeyError("no embedding found for word %r" % (word,))
            # Join once per line instead of growing a string per component;
            # this also yields exactly one space between all fields.
            fields = [word] + [str(component) for component in vector]
            out_file.write(" ".join(fields) + "\n")

In [14]:
# Load the domain-specific ("DS") embedding table and show how many entries it has.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ds_path = "/home/eastwind/word-embeddings/fasttext/TechDofication.mr.raw.complete.ft.skipgram.new.d300.vec"
ds_embedding_dict = get_embeddings(path=ds_path)
len(ds_embedding_dict)

55141

In [15]:
# Load the Indic-fasttext embedding table and show how many entries it has.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
indic_path = "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec"
indic_embedding_dict = get_embeddings(path=indic_path)
len(indic_embedding_dict)

258414

In [9]:
# Load the cc fastText embedding table and show how many entries it has.
# NOTE(review): every later use of this table (ft_words and the ft_* sets) is
# commented out in the visible notebook, so this load may be skippable.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ft_path = "/home/eastwind/word-embeddings/fasttext/cc.mr.300.vec"
ft_embedding_dict = get_embeddings(path=ft_path)
len(ft_embedding_dict)

922815

In [16]:
# Vocabulary of each embedding table, taken from its dictionary keys.
# These sets are snapshots: later changes to the dictionaries do not update them.
ds_words = set(ds_embedding_dict.keys())
indic_words = set(indic_embedding_dict.keys())
# Disabled: the cc fastText vocabulary is not built, so ft_words is undefined below.
#ft_words = set(ft_embedding_dict.keys())
print("\nTotal words in domain Specific embeddings: ", len(ds_words))
print("\nTotal words in Indic-fasttext embeddings: ", len(indic_words))
#print("\nTotal words in fasttext embeddings: ", len(ft_words))


Total words in domain Specific embeddings:  55141

Total words in Indic-fasttext embeddings:  258414


# Intersection Vocabulary

In [22]:
# Words that have a vector in both the domain-specific and the Indic table.
ds_indic_intersect = ds_words.intersection(indic_words)
print("\nDS and indic-fasttext embeddings intersection: ", len(ds_indic_intersect))

# Disabled cc-fastText comparisons; they need ft_words, which is not built.
#ds_ft_intersect = ds_words & ft_words
#ft_indic_intersect = ft_words & indic_words
#print("\nDS and fasttext embeddings intersection: ", len(ds_ft_intersect))
#print("\nfasttext and indic-fasttext embeddings intersection: ", len(ft_indic_intersect))


DS and indic-fasttext embeddings intersection:  30691


In [None]:
# Save the DS/Indic intersection vocabulary, one word per line.
# The earlier calls here passed two arguments to
# write_to_file(path, embedding_dict, vocab), which raises TypeError.
# NOTE(review): assumes these .txt files are plain word lists (no vectors) —
# confirm. sorted() makes the file order reproducible across kernel restarts.
with open("vocabulary/ds-indic-intersect.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(vocab_word + "\n" for vocab_word in sorted(ds_indic_intersect))

# Disabled: ds_ft_intersect and ft_indic_intersect are only computed by
# commented-out lines, so these calls would raise NameError. Re-enable them
# (written the same way as above) together with the fastText set computations.
#write_to_file("vocabulary/ds-ft-intersect.txt", ds_ft_intersect)
#write_to_file("vocabulary/ft-indic-intersect.txt", ft_indic_intersect)

# Union Vocabulary

In [23]:
# Words that have a vector in at least one of the domain-specific and Indic tables.
ds_indic_union = ds_words.union(indic_words)
print("\nDS and indic-fasttext embeddings Union: ", len(ds_indic_union))

# Disabled cc-fastText comparisons; they need ft_words, which is not built.
#ds_ft_union = ds_words | ft_words
#ft_indic_union = ft_words | indic_words
#print("\nDS and fasttext embeddings Union: ", len(ds_ft_union))
#print("\nfasttext and indic-fasttext embeddings Union: ", len(ft_indic_union))


DS and indic-fasttext embeddings Union:  282864


In [None]:
# Save the DS/Indic union vocabulary, one word per line.
# The earlier calls here passed two arguments to
# write_to_file(path, embedding_dict, vocab), which raises TypeError.
# NOTE(review): assumes these .txt files are plain word lists (no vectors) —
# confirm. sorted() makes the file order reproducible across kernel restarts.
with open("vocabulary/ds-indic-union.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(vocab_word + "\n" for vocab_word in sorted(ds_indic_union))

# Disabled: ds_ft_union and ft_indic_union are only computed by commented-out
# lines, so these calls would raise NameError. Re-enable them (written the
# same way as above) together with the fastText set computations.
#write_to_file("vocabulary/ds-ft-union.txt", ds_ft_union)
#write_to_file("vocabulary/ft-indic-union.txt", ft_indic_union)

# Minus Vocabulary

In [17]:
# Words that only the Indic table has (Indic vocabulary with the DS words removed).
indic_minus_ds = indic_words.difference(ds_words)
print("\nindic-fasttext minus DS vocab: ", len(indic_minus_ds))

# Disabled cc-fastText variant; it needs ft_words, which is not built.
# NOTE(review): the disabled print reports len(indic_minus_ds), not the ft set.
#ft_minus_indic_plus_ds = ft_words - indic_minus_ds
#print("\nFasttext minus indic-fasttext plus DS vocab: ", len(indic_minus_ds))


indic-fasttext minus DS vocab:  227723


In [None]:
# Extend ds_embedding_dict in place with every Indic word it lacks.
# Words already in the DS vocabulary keep their domain-specific vector.
for word in indic_words:
    if word in ds_words:
        continue
    # np.array makes a fresh float32 copy, so the two tables do not share arrays.
    ds_embedding_dict[word] = np.array(indic_embedding_dict.get(word), dtype='float32')


In [20]:
# Number of entries currently in ds_embedding_dict (displayed as the cell output).
len(ds_embedding_dict)

282864

In [None]:
# Save both set differences as vocabulary files, one word per line.
# The earlier calls here passed two arguments to
# write_to_file(path, embedding_dict, vocab), which raises TypeError, and
# ds_minus_indic was never defined.
# NOTE(review): assumes these .txt files are plain word lists (no vectors) — confirm.
ds_minus_indic = ds_words - indic_words
for vocab_path, vocab_words in (
    ("vocabulary/ds-minus-indic.txt", ds_minus_indic),
    ("vocabulary/indic-minus-ds.txt", indic_minus_ds),
):
    with open(vocab_path, "w", encoding="utf-8") as vocab_file:
        # sorted() makes the file order reproducible across kernel restarts.
        vocab_file.writelines(vocab_word + "\n" for vocab_word in sorted(vocab_words))

In [29]:
# Write one line per word of the DS/Indic union, using the vectors in ds_embedding_dict.
# Every union word must already have an entry there, i.e. the Indic-only
# words must have been merged into ds_embedding_dict before this runs.
write_to_file(
    path="vocabulary/indic_plus_ds.vec",
    embedding_dict=ds_embedding_dict,
    vocab=ds_indic_union,
)

In [30]:
# Read the written file back and show its entry count as a round-trip check.
emb = get_embeddings(path="vocabulary/indic_plus_ds.vec")
len(emb)

282864