In [0]:
import io
import tensorflow as tf
import unicodedata
import re

def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w, lang):
  w = w.strip()
  if lang=="en":
    w = unicode_to_ascii(w.lower())

    # Adding space with punctuation.
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # Removing everything except(letters)
    w = re.sub(r"[^a-z]+", " ", w)

  w = w.strip()
  w = '<start> ' + w + ' <end>'
  return w

def create_dataset(path, num_examples=None):
  # This method is called only on unaligned samples. 
  # Since english is already lowered and punctuations are removed, preprocess it as french samples
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
  word_pairs = [[preprocess_sentence(w, lang="fr") for w in l.split('\t')]  for l in lines[:num_examples]]
  return zip(*word_pairs)

def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ', lower=False)
  lang_tokenizer.fit_on_texts(lang)
  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
  return tensor, lang_tokenizer

In [2]:
en, fr = create_dataset("./pairs_en_fr.txt")
input_tensor, inp_lang_tokenizer = tokenize(list(en))
target_tensor, targ_lang_tokenizer = tokenize(list(fr))
print("English Aligned Vocab Size", str(len(inp_lang_tokenizer.index_word)))
print("French Aligned Vocab Size", str(len(targ_lang_tokenizer.index_word)))

English Aligned Vocab Size 13659
French Aligned Vocab Size 18237


In [3]:
sent = preprocess_sentence("My nàme is Akshay", lang="fr")
sent

'<start> My nàme is Akshay <end>'

In [0]:
lang = "en"
class MyIter(object):
  def __iter__(self):
    count = 0
    path = "unaligned."+lang
    with open(path, 'r', encoding='utf-8') as fin:
      for line in fin:
        count+=1
        sentence = preprocess_sentence(line, lang=lang)
        if count%100000==0:
          print(count)
        yield sentence.split(" ")


In [16]:
from gensim.utils import tokenize
from gensim import utils
from gensim.models import FastText, Word2Vec
from gensim.test.utils import datapath

model = Word2Vec(size=128, window=3, min_count=1)
model.build_vocab(sentences=MyIter())
total_examples = model.corpus_count
len(model.wv.vocab)

100000
200000
300000
400000


55409

In [17]:
model.train(sentences=MyIter(), total_examples=total_examples, epochs=5)

100000
200000
300000
400000
100000
200000
300000
400000
100000
200000
300000
400000
100000
200000
300000
400000
100000
200000
300000
400000


(33218059, 48078730)

In [0]:
word_vectors_en = model.wv
import pickle
with open("word_vectors_en.pkl", 'wb') as f:
  pickle.dump(word_vectors_en, f)

In [0]:
word_vectors_fr = model.wv
import pickle
with open("word_vectors_fr.pkl", 'wb') as f:
  pickle.dump(word_vectors_fr, f)

In [0]:
def get_embedding_matrix(tokenizer, word_vector, emb_size=128):
  vocab_size = len(tokenizer.word_index)+1
  embedding_matrix = np.zeros((vocab_size,emb_size), dtype="float32")
  for word,index in tokenizer.word_index.items():
    embedding_matrix[index] = word_vector[word] if word in word_vector else np.zeros(emb_size, dtype="float32")
  return embedding_matrix

In [20]:
import numpy as np
emb_en = get_embedding_matrix(inp_lang_tokenizer, word_vectors_en)
emb_en.shape

(13660, 128)

In [0]:
with open("emb_en_128.pkl", 'wb') as f:
  pickle.dump(emb_en, f)

In [13]:
import numpy as np
emb_fr = get_embedding_matrix(targ_lang_tokenizer, word_vectors_fr)
emb_fr.shape

(18238, 128)

In [0]:
with open("emb_fr_128.pkl", 'wb') as f:
  pickle.dump(emb_fr, f)

In [32]:
# from gensim.test.utils import get_tmpfile
# fname = get_tmpfile("fasttext_en.model")

model.save("w2v_v150_400k_en.model")
# model = FastText.load(fname)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
model = FastText.load("fname")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


FileNotFoundError: ignored