In [31]:
import fasttext.util
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from gensim.models.fasttext import FastText 
from gensim.models import LsiModel
import random
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
from time import process_time
import errno,pickle
import numpy as np
import codecs

In [26]:
class TextToTensor:

    # --------------------------------------- Constructor --------------------------------------- 
    
    def __init__(self, tokenizer, max_len):
    
        self.tokenizer = tokenizer
        self.max_len = max_len

    
    def string_to_tensor(self, string_list: list) -> list:
        """
        A method to convert a string list to a tensor for a deep learning model
        """    
        string_list = self.tokenizer.texts_to_sequences(string_list)
        string_list = pad_sequences(string_list, maxlen=self.max_len)
        
        return string_list

In [118]:
class Embeddings:

    """
    A class to read the word embedding file and to create the word embedding matrix
    """

#     def __init__(self, path, vector_dimension):
#         self.path = path 
#         self.vector_dimension = vector_dimension
    
    
    @staticmethod
    def get_coefs(word, *arr): 
        return word, np.asarray(arr, dtype='float32')

    
    def get_embedding_index(self):
        embeddings_index = dict(self.get_coefs(*o.split(" ")) for o in open(self.path, errors='ignore'))
        return embeddings_index

    
    def create_embedding_matrix(self, tokenizer, max_features):
        """
        A method to create the embedding matrix
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        for word, index in tokenizer.word_index.items():
            if index > max_features:
                break
            else:
                try:
                    embedding_matrix[index] = model_embed[word]
                except:
                    continue
        
        return embedding_matrix
    
    def save_embeddings(self,model,filepath):
        
        if (".vec" in filepath or ".txt" in filepath):
            file = open(filepath, "w",encoding= 'utf-8')
            words = model.keys()
            cnt = 0
            for w in words:
                v = model[w]
                vstr = ""
                for value in v:
                    vstr += " " + str(value)
                try:
                    row = w + vstr + "\n"
                    file.write(row)
                    cnt += 1
                except Exception as e:
                    print('Exception: ',e)
#                     if e.errno == errno.EPIPE:
#                         pass
            print('Words processed: ',cnt)

        
        elif ".plk" in filepath:
            with open(filepath,'wb') as file:
                pickle.dump(embeddings_dict, file, pickle.HIGHEST_PROTOCOL)
        
        else:
            print('Invalid File type')
    
    def load_embeddings(self,filepath):
        
        if (".vec" in filepath or ".txt" in filepath):
            print("Loading Model")
            f = open(filepath,'r',encoding='utf8')
            model = {}

            for line in f:
                splitLines = line.split()
                word = splitLines[0]
                wordEmbedding = np.array([float(value) for value in splitLines[1:]])
                model[word] = wordEmbedding
            print(len(model)," words loaded.")

            return model
        
        elif('.plk' in filepath):
            print("Loading Model")
            f = open(filepathp,'rb',encoding='utf8')
            model = pickle.load(f)
            print(len(model.keys())," words loaded.")
            
            return model
        
        else:
            return None
    
    def concatenate_embeddings(self,embedding_dict1, embedding_dict2, intersection_only = True):
        
        embedding_num1 = len(random.choice(list(embedding_dict1.values())))
        embedding_num2 = len(random.choice(list(embedding_dict2.values()))) 
        word_set1 = set(embedding_dict1.keys())
        word_set2 = set(embedding_dict2.keys())
        print("Input Details: \nSet1: ({},{})\nSet2: ({},{})".format(len(word_set1),embedding_num1, len(word_set2),embedding_num2))
        concatenated_matrix = np.empty([0,embedding_num1 + embedding_num2])

#         count = 0
        
        if(intersection_only == True):
            vocab = word_set1.intersection(word_set2)
            print('Common Vocab:',len(vocab))
            
            for word in vocab:
                #print('Accessing word:',word)
                vec1 = embedding_dict1[word]
                vec2 = embedding_dict2[word]
                vec_conc = np.concatenate((vec1,vec2))

                concatenated_matrix = np.vstack((concatenated_matrix,vec_conc))
#                 count += 1
#                 if(count == 10): break
                    
            print('\nOutput Shape: ',(concatenated_matrix.shape))
        
        else:
            vocab = word_set1.union(word_set2)
            common_v = word_set1.intersection(word_set2)
            
            for word in vocab:
                
                if not word in common_v:
                    # Logic for projecting embeddings 
                    vec_conc = np.zeros([0,embedding_num1 + embedding_num2])
                    concatenated_matrix = np.vstack((concatenated_matrix,vec_conc))
                
                else:
                    vec1 = embedding_dict1['word']
                    vec2 = embedding_dict2['word']
                    vec_conc = np.concatenate((vec1,vec2))
                    concatenated_matrix = np.vstack((concatenated_matrix,vec_conc))
                    
        return concatenated_matrix, list(vocab)
    
    
    def perform_SVD(self,matrix, num_components = 2):
        
            print('Input Shape: {}\n'.format(matrix.shape))
            
            svd = TruncatedSVD(n_components = num_components)
            new_matrix = svd.fit_transform(matrix)
            
            print('Output Shape: {}\n'.format(new_matrix.shape))
            
            return new_matrix

In [119]:
embed = Embeddings()
# dict1 = {'word1': [1] ,'word2' :[2] ,'word3' :[3]}
# dict2 = {'word1': [1] ,'word2' :[2] ,'word4' :[3]}

In [29]:
indicnlp_embeddings_300 = embed.load_embeddings('../Technodifacation/Embeddings/indicnlp.ft.mr.300.vec')
ft_embeddings_300 = embed.load_embeddings('../Technodifacation/models/DS_fasttext_skipgram_cleaned_300.vec')

Loading Model
258414  words loaded.
Loading Model
51954  words loaded.


In [125]:
start = process_time()
concatenation_matrix, vocab_list = embed.concatenate_embeddings(ft_embeddings_300,indicnlp_embeddings_300)
end = process_time()
print("Total time taken: ", end-start)

Input Details: 
Set1: (51954,300)
Set2: (258414,300)
Common Vocab: 29252

Output Shape:  (29252, 600)
Total time taken:  1609.703125


In [126]:
concatenation_matrix[0]

array([ 2.32528950e-01, -1.65739860e-01,  5.56280600e-01, -3.37442000e-01,
       -4.51513500e-02,  1.17013790e-01, -2.34878110e-01, -6.37767900e-02,
       -1.60344100e-01, -3.24652080e-01,  5.03753300e-01,  1.54610350e-02,
       -1.27253920e-01,  5.08804860e-01, -1.34303140e-01,  1.40366990e-01,
        3.53206220e-01, -3.72080800e-01,  2.22945630e-01, -8.42555500e-01,
       -9.86273600e-02, -2.30949060e-01,  6.77504060e-01, -2.67454830e-01,
        2.27807800e-02,  1.85186340e-02, -2.08578970e-01,  1.84468730e-01,
       -4.04474380e-01,  1.14497580e-01,  3.78842200e-01, -2.10898160e-01,
        4.28036180e-01, -1.51156830e-02,  2.64869900e-03, -1.37737150e-01,
       -4.26138380e-02,  4.46910140e-01, -3.89181440e-01,  4.50543000e-01,
        1.26009350e-01,  8.41780840e-01, -4.10878960e-02, -2.87822430e-01,
        8.42054560e-02,  2.38345830e-01, -1.14306370e-01,  2.48483580e-01,
       -2.56841200e-01,  2.49823450e-01, -3.76220520e-01,  1.07442334e-01,
       -3.05283960e-02, -

In [127]:
# Saving concatenated vectors

embed_dict = {}
for index in range(concatenation_matrix.shape[0]):
    embed_dict[vocab_list[index]] = concatenation_matrix[index] 
                   
embed.save_embeddings(embed_dict,'../Technodifacation/Embeddings/indic_300.ft_clean_300_concatenated.vec')

Words processed:  29252


In [128]:
# Performing SVD on concatenated matrix (600 -> 300)

svd = embed.perform_SVD(concatenation_matrix,num_components=300)

Input Shape: (29252, 600)

Output Shape: (29252, 300)



In [130]:
# Saving Results

svd_dict = {}
for index in range(svd.shape[0]):
    svd_dict[vocab_list[index]] = svd[index] 
                   
embed.save_embeddings(svd_dict,'../Technodifacation/Embeddings/SVD_indic_300.ft_clean_300_concatenated.vec')

Words processed:  29252


In [124]:
# Checking saved files

svd_embeddings = embed.load_embeddings('../Technodifacation/Embeddings/SVD_ft_clean_300.indic_300_concatenated.vec')
len(svd_embeddings)

Loading Model
29252  words loaded.


29252