In [5]:
import fasttext.util
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [6]:
class TextToTensor:
    """
    Converts raw strings into padded integer sequences.

    The tokenizer supplied at construction must expose a
    ``texts_to_sequences`` method; ``max_len`` is forwarded to
    ``pad_sequences`` as its ``maxlen`` argument.
    """

    # --------------------------------------- Constructor ---------------------------------------

    def __init__(self, tokenizer, max_len):
        # Both values are stored untouched and only read by string_to_tensor.
        self.max_len = max_len
        self.tokenizer = tokenizer

    # ----------------------------------------- Methods -----------------------------------------

    def string_to_tensor(self, string_list: list) -> list:
        """
        Tokenize a list of strings and pad the resulting sequences.

        Parameters
        ----------
        string_list : list
            The texts handed to ``self.tokenizer.texts_to_sequences``.

        Returns
        -------
        The value returned by ``pad_sequences`` for the tokenized texts,
        called with ``maxlen=self.max_len``.
        """
        token_sequences = self.tokenizer.texts_to_sequences(string_list)
        return pad_sequences(token_sequences, maxlen=self.max_len)

In [7]:
class Embeddings:

    """
    A class to read the word embedding file and to create the word embedding matrix.

    The file at ``path`` is read as text, one entry per line, with the token
    first and its vector components after it, all separated by single spaces.
    ``vector_dimension`` is the number of columns of the embedding matrix and
    the length a vector must have to be copied into it.
    """

    def __init__(self, path, vector_dimension):
        self.path = path
        self.vector_dimension = vector_dimension


    @staticmethod
    def get_coefs(word, *arr):
        """
        Turn one split line into a ``(word, vector)`` pair.

        ``arr`` holds the remaining fields of the line; they are converted to
        a float32 numpy array. Raises ``ValueError`` if a field is not numeric.
        """
        return word, np.asarray(arr, dtype='float32')


    def get_embedding_index(self):
        """
        Read the embedding file and return a ``{word: float32 vector}`` dict.

        Every non-blank line is kept, whatever the length of its vector, so a
        leading ``"<count> <dim>"`` header line (as found in ``.vec`` files)
        shows up as an ordinary entry with a one-element vector. If a word
        appears several times, the last occurrence wins.
        """
        embeddings_index = {}
        # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) would
        # silently corrupt non-ASCII tokens, since errors='ignore' hides the
        # decoding failures. The context manager guarantees the file is closed.
        with open(self.path, encoding='utf-8', errors='ignore') as handle:
            for line in handle:
                # Drop the newline and any trailing spaces so that they do not
                # become a bogus, non-numeric last field.
                fields = line.rstrip(" \r\n").split(" ")
                if fields == [""]:
                    continue  # blank line
                word, vector = self.get_coefs(*fields)
                embeddings_index[word] = vector
        return embeddings_index


    def create_embedding_matrix(self, tokenizer, max_features):
        """
        A method to create the embedding matrix.

        Parameters
        ----------
        tokenizer
            Object exposing ``word_index``, a ``{word: integer index}`` mapping.
        max_features
            Largest word index to keep; words with a larger index are ignored.

        Returns
        -------
        numpy array of shape ``(max_features + 1, vector_dimension)``. Row
        ``i`` holds the vector of the word whose index is ``i``; rows for
        words that are missing from the embedding file, or whose stored vector
        does not have ``vector_dimension`` components, stay all-zero.
        """
        model_embed = self.get_embedding_index()

        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        expected_shape = (self.vector_dimension,)
        for word, index in tokenizer.word_index.items():
            if index > max_features:
                # Skip rather than stop: this does not rely on word_index
                # being sorted by index.
                continue
            vector = model_embed.get(word)
            if vector is None:
                continue  # out-of-vocabulary word keeps its zero row
            if vector.shape != expected_shape:
                # Guards against numpy broadcasting a short vector (such as
                # the one parsed from a header line) across the whole row.
                continue
            embedding_matrix[index] = vector

        return embedding_matrix


In [25]:
from gensim.models._fasttext_bin import load
from gensim.models import KeyedVectors

In [27]:
# Location of the word-vector file to load (presumably the fastText Marathi
# vectors, going by the file name - confirm).
# NOTE(review): this is an absolute, machine-specific path; the cell will only
# run where this exact file exists.
MARATHI_VECTORS_PATH = 'C:/Users/Amey/Desktop/Neural Networks/Data/Technodification/cc.mr.300.vec'

# Parse the file in word2vec format into a gensim KeyedVectors object.
marathi_embeddings = KeyedVectors.load_word2vec_format(MARATHI_VECTORS_PATH)

In [34]:
# Show the vector stored for one word. The KeyedVectors object is indexed
# directly: its `.wv` attribute is deprecated in gensim 3.x (it triggers a
# DeprecationWarning) and no longer exists in gensim 4.
print(marathi_embeddings['आहे'])

[-0.0233  0.056  -0.0185  0.0396 -0.011  -0.0608 -0.0087 -0.0886  0.0124
 -0.1066  0.0031 -0.0524 -0.0261 -0.0324  0.0053 -0.0085 -0.0133  0.0498
 -0.0614 -0.038   0.0037 -0.0232  0.0522  0.0014 -0.0146 -0.0235 -0.0196
  0.0005  0.0147 -0.0096 -0.0016 -0.0201  0.0071 -0.0342 -0.0091 -0.0208
  0.0331 -0.0564  0.03    0.0187  0.0193  0.0166 -0.0147 -0.0158 -0.0191
 -0.0029 -0.0837 -0.0334 -0.0188 -0.038  -0.0305  0.0477  0.039   0.0205
  0.0142  0.0436  0.0141 -0.041  -0.0942 -0.0151  0.0113  0.0272  0.0329
 -0.0221 -0.0119  0.0388 -0.0322 -0.0247  0.0071 -0.0251 -0.0068 -0.0084
 -0.0114 -0.0186 -0.0155 -0.0262  0.0136  0.0299  0.0745  0.005  -0.0231
  0.0093 -0.0141  0.0232 -0.1405 -0.0382 -0.05   -0.0241  0.0039  0.0321
 -0.0181  0.0463 -0.0487  0.2364 -0.0334 -0.0109  0.0289  0.058   0.0045
  0.0208  0.1884 -0.024  -0.0055 -0.1083  0.03   -0.0382 -0.0833  0.0132
  0.0075 -0.0228 -0.1422 -0.0304  0.0473 -0.004  -0.038   0.0031 -0.0714
  0.0186  0.0519 -0.022   0.0064 -0.0477 -0.0088 -0

  """Entry point for launching an IPython kernel.
