<a href="https://colab.research.google.com/github/Yonatan-max/MyFirstRepository/blob/master/Tig_Eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#creating model
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from numpy import array
import pandas as pd
# Read the two-column (English, Tigrigna) text file and keep only the first 500 rows.
df = pd.read_table('text2notepad.txt', names=['English', 'Tigrigna'])
tig_eng = array(df)[:500]
# build a tokenizer whose vocabulary comes from the given lines
def tokenization(lines):
    """Create a Keras ``Tokenizer``, fit it on *lines*, and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
# max sentence length
def max_length(lines):
    """Return the largest number of whitespace-separated words in any line.

    Args:
        lines: iterable of strings.

    Returns:
        The word count of the longest line, or 0 when *lines* is empty
        (a bare ``max()`` would raise ``ValueError`` on an empty corpus).
    """
    return max((len(line.split()) for line in lines), default=0)
# Fit one tokenizer per language: column 0 holds English, column 1 holds Tigrigna.
eng_tokenizer = tokenization(tig_eng[:, 0])
tig_tokenizer = tokenization(tig_eng[:, 1])
# Vocabulary sizes, with one extra slot on top of the indexed words.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
tig_vocab_size = 1 + len(tig_tokenizer.word_index)
# Longest sentence (in words) per language; used as the padded sequence length.
eng_length = max_length(tig_eng[:, 0])
tig_length = max_length(tig_eng[:, 1])
def encode_sequences(tokenizer, length, lines):
    """Integer-encode *lines* with *tokenizer* and pad each row to *length*.

    Padding uses 0 values appended after the encoded words (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
# Training pairs: Tigrigna (column 1) is the model input, English (column 0) the target.
trainY = encode_sequences(eng_tokenizer, eng_length, tig_eng[:, 0])
trainX = encode_sequences(tig_tokenizer, tig_length, tig_eng[:, 1])
# build NMT model
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the sequence-to-sequence network as a Keras ``Sequential``.

    The layer stack, in order: a masked embedding over the input vocabulary,
    an LSTM, a ``RepeatVector`` that repeats its output ``out_timesteps``
    times, a second LSTM returning the full sequence, and a softmax ``Dense``
    layer over the output vocabulary. The model is returned uncompiled.
    """
    layers = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    return Sequential(layers)
# Build the network (Tigrigna in, English out, 512 units) and compile it with RMSprop.
model = define_model(
    in_vocab=tig_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=tig_length,
    out_timesteps=eng_length,
    units=512,
)
rms = optimizers.RMSprop(lr=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)
# Save to 'tig_eng_model' only when the validation loss improves.
filename = 'tig_eng_model'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# The targets get a trailing axis of size 1; 20% of the rows are held out for validation.
model.fit(
    trainX,
    trainY.reshape(trainY.shape + (1,)),
    epochs=30,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

In [0]:
# continue training the saved model ('tig_eng_model') on text1notepad.txt
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from numpy import array
import pandas as pd
# Read the two-column (English, Tigrigna) text file; all rows are used (no 500-row cap here).
df = pd.read_table(
    'text1notepad.txt',
    names=['English', 'Tigrigna'],
)
tig_eng = array(df)
# build a tokenizer whose vocabulary comes from the given lines
def tokenization(lines):
    """Return a new Keras ``Tokenizer`` after fitting it on *lines*."""
    tok = Tokenizer()
    tok.fit_on_texts(lines)
    return tok
def max_length(lines):
    """Return the largest number of whitespace-separated words in any line.

    Args:
        lines: iterable of strings.

    Returns:
        The word count of the longest line, or 0 when *lines* is empty
        (a bare ``max()`` would raise ``ValueError`` on an empty corpus).
    """
    return max((len(line.split()) for line in lines), default=0)
# Fit one tokenizer per language: column 0 holds English, column 1 holds Tigrigna.
eng_tokenizer = tokenization(tig_eng[:, 0])
tig_tokenizer = tokenization(tig_eng[:, 1])
# Vocabulary sizes, with one extra slot on top of the indexed words.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
tig_vocab_size = 1 + len(tig_tokenizer.word_index)
# Longest sentence (in words) per language; used as the padded sequence length.
eng_length = max_length(tig_eng[:, 0])
tig_length = max_length(tig_eng[:, 1])
def encode_sequences(tokenizer, length, lines):
    """Turn *lines* into integer sequences and pad them to *length* columns.

    Zeros are appended after the encoded words (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )
# Training pairs: Tigrigna (column 1) is the model input, English (column 0) the target.
trainY = encode_sequences(eng_tokenizer, eng_length, tig_eng[:, 0])
trainX = encode_sequences(tig_tokenizer, tig_length, tig_eng[:, 1])
# Reload the saved model and keep checkpointing to the same path.
# NOTE(review): the tokenizers in this cell are refit on text1notepad.txt; confirm
# that their vocabulary sizes and padded lengths match the model stored on disk.
filename = 'tig_eng_model'
model = load_model('tig_eng_model')
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# The targets get a trailing axis of size 1; 20% of the rows are held out for validation.
model.fit(
    trainX,
    trainY.reshape(trainY.shape + (1,)),
    epochs=30,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

In [0]:
#using model from english to tigrigna
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from numpy import array,argmax
import pandas as pd
import numpy as np
# build a tokenizer whose vocabulary comes from the given lines
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer``, fit it on *lines*, and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
def max_length(lines):
    """Return the length of the longest line, measured in characters.

    Args:
        lines: iterable of strings (anything supporting ``len``).

    Returns:
        ``len()`` of the longest item, or 0 when *lines* is empty
        (a bare ``max()`` would raise ``ValueError``).

    NOTE(review): this counts characters, whereas the ``max_length`` used by
    the training cells counts whitespace-separated words — confirm which one
    is intended before relying on this value as a sequence length.
    """
    return max((len(line) for line in lines), default=0)
 
def encode_sequences(tokenizer, length, lines):
    """Integer-encode *lines* with *tokenizer* and pad each row to *length*.

    Padding uses 0 values appended after the encoded words (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

# Read the two-column (English, Tigrigna) evaluation file.
df = pd.read_table('text3notepad.txt', names=['English', 'Tigrigna'])
english_lines = df['English'].tolist()
# prepare english tokenizer
# NOTE(review): this tokenizer is refit on text3notepad.txt, so its word ids only
# line up with the saved model if the vocabulary is the same — confirm.
eng_tokenizer = create_tokenizer(english_lines)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(english_lines)
# prepare data (trainX and testX are encoded from the same English lines)
trainX = encode_sequences(eng_tokenizer, eng_length, english_lines)
testX = encode_sequences(eng_tokenizer, eng_length, english_lines)
# load model
model = load_model('tig_eng_model')
# Feed only the first 4 timesteps of every encoded row.
# NOTE(review): 4 presumably equals the model's input length — confirm.
source = array(testX[:, :4])
# Sequential.predict_classes() no longer exists in current Keras/TensorFlow;
# taking the argmax over the softmax axis gives the same class ids.
preds = argmax(model.predict(source), axis=-1)
def get_word(n, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals *n*.

    Returns the first matching word, or ``None`` when no word has that index.
    """
    return next(
        (word for word, index in tokenizer.word_index.items() if index == n),
        None,
    )
# Turn every row of predicted class ids into a sentence.
# NOTE(review): tig_tokenizer is not created in this cell — it has to already
# exist in the notebook session from another cell.
# Build the id -> word table once instead of scanning the vocabulary per token.
index_to_word = {index: word for word, index in tig_tokenizer.word_index.items()}
preds_text = []
for row in preds:
    temp = []
    previous = None
    for position, class_id in enumerate(row):
        word = index_to_word.get(class_id)
        # Unknown ids (e.g. padding 0) and immediate repeats of the previous
        # word become empty strings, so the joined text keeps one slot per step.
        if word is None or (position > 0 and word == previous):
            temp.append('')
        else:
            temp.append(word)
        previous = word

    preds_text.append(' '.join(temp))
# Side-by-side view: input sentence, reference translation, model output.
pred_df = pd.DataFrame({'TobePred': df['English'], 'actual': df['Tigrigna'], 'predicted': preds_text})
pred_df

In [0]:
#using model from tigrigna to english

from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from numpy import array,argmax
import pandas as pd
import numpy as np
# build a tokenizer whose vocabulary comes from the given lines
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` after fitting it on *lines*."""
    tok = Tokenizer()
    tok.fit_on_texts(lines)
    return tok
def max_length(lines):
    """Return the length of the longest line, measured in characters.

    Args:
        lines: iterable of strings (anything supporting ``len``).

    Returns:
        ``len()`` of the longest item, or 0 when *lines* is empty
        (a bare ``max()`` would raise ``ValueError``).

    NOTE(review): this counts characters, whereas the ``max_length`` used by
    the training cells counts whitespace-separated words — confirm which one
    is intended before relying on this value as a sequence length.
    """
    return max((len(line) for line in lines), default=0)
 
def encode_sequences(tokenizer, length, lines):
    """Turn *lines* into integer sequences and pad them to *length* columns.

    Zeros are appended after the encoded words (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

# Read the two-column (English, Tigrigna) evaluation file.
df = pd.read_table('text3notepad.txt', names=['English', 'Tigrigna'])
tigrigna_lines = df['Tigrigna'].tolist()
# prepare tigrigna tokenizer
# NOTE(review): this tokenizer is refit on text3notepad.txt, so its word ids only
# line up with the saved model if the vocabulary is the same — confirm.
tig_tokenizer = create_tokenizer(tigrigna_lines)
tig_vocab_size = len(tig_tokenizer.word_index) + 1
tig_length = max_length(tigrigna_lines)
# prepare data (trainX and testX are encoded from the same Tigrigna lines)
trainX = encode_sequences(tig_tokenizer, tig_length, tigrigna_lines)
testX = encode_sequences(tig_tokenizer, tig_length, tigrigna_lines)
# load model
model = load_model('tig_eng_model')
# Feed only the first 4 timesteps of every encoded row.
# NOTE(review): 4 presumably equals the model's input length — confirm.
source = array(testX[:, :4])
# Sequential.predict_classes() no longer exists in current Keras/TensorFlow;
# taking the argmax over the softmax axis gives the same class ids.
preds = argmax(model.predict(source), axis=-1)
def get_word(n, tokenizer):
    """Return the word mapped to index *n* in ``tokenizer.word_index``.

    The first word whose index equals *n* wins; ``None`` is returned when
    nothing matches.
    """
    matches = [word for word, index in tokenizer.word_index.items() if index == n]
    if not matches:
        return None
    return matches[0]
# Turn every row of predicted class ids into a sentence.
# NOTE(review): eng_tokenizer is not created in this cell — it has to already
# exist in the notebook session from another cell.
# Build the id -> word table once instead of scanning the vocabulary per token.
index_to_word = {index: word for word, index in eng_tokenizer.word_index.items()}
preds_text = []
for row in preds:
    temp = []
    previous = None
    for position, class_id in enumerate(row):
        word = index_to_word.get(class_id)
        # Unknown ids (e.g. padding 0) and immediate repeats of the previous
        # word become empty strings, so the joined text keeps one slot per step.
        if word is None or (position > 0 and word == previous):
            temp.append('')
        else:
            temp.append(word)
        previous = word

    preds_text.append(' '.join(temp))
# Side-by-side view: input sentence, reference translation, model output.
pred_df = pd.DataFrame({'TobePred': df['Tigrigna'], 'actual': df['English'], 'predicted': preds_text})
pred_df