In [1]:
from pickle import load
import tensorflow as tf
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.callbacks import ModelCheckpoint
import re

In [2]:
def preprocess(file):
    """Read a corpus file and return its sentences as cleaned strings.

    The file content is split on the ``</s>`` end-of-sentence marker. Within
    each piece the ``<s>`` markers are removed, newlines are replaced by single
    spaces and surrounding whitespace is trimmed.

    Args:
        file: Path of a UTF-8 encoded text file.

    Returns:
        list[str]: One cleaned string per piece, in file order. Pieces that
        are empty after cleaning are kept so that parallel corpora stay
        aligned, with one exception: an empty piece after the last ``</s>``
        marker is dropped, because it is only the leftover of the split and
        not a sentence.
    """
    with open(file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    data = []
    for piece in raw_text.strip().split('</s>'):
        line = re.sub("\n.\n',", '.', piece)
        # NOTE(review): the '.' also consumes the character right after '<s>'
        # whenever that character is not a newline -- confirm this is intended.
        line = re.sub('\n<s>.', '', line)
        line = re.sub('\n<s>', '', line)
        line = re.sub('\n', ' ', line)
        line = re.sub('<s>', '', line)
        data.append(line.strip())

    # A file that ends with '</s>' produces one empty trailing piece from the
    # split; it would otherwise become a phantom empty sample.
    if data and not data[-1]:
        data.pop()
    return data

In [3]:
# Locations of the parallel training corpora (source side, target side).
source, target = '/content/train-source.txt', '/content/train-target.txt'


In [4]:
# Load both corpora through preprocess() and wrap each sentence list in a
# numpy array.
source_data, target_data = (
    array(preprocess(corpus_path)) for corpus_path in (source, target)
)

In [5]:
# The two corpora are used as parallel (source, target) pairs further down,
# so they must contain the same number of sentences. Fail loudly here instead
# of only displaying False and letting later cells run on misaligned data.
assert source_data.shape == target_data.shape, (
    'source and target corpora differ in shape: %s vs %s'
    % (source_data.shape, target_data.shape))
source_data.shape == target_data.shape

True

Adapted largely from: https://github.com/AarohiSingla/Language-translator-using-seq2seq-model/blob/main/seq2seq_full_code_algo.ipynb

In [6]:
#source_data = source_data[:20000]
#source_data.shape
def tokenization(data):
    """Create a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        data: Texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(data)
    return fitted

# Fit one tokenizer per corpus.
source_tokenizer, target_tokenizer = tokenization(source_data), tokenization(target_data)

# Vocabulary size is the number of entries in word_index plus one
# (presumably to leave index 0 free for the padding value -- confirm).
source_vocab_size = 1 + len(source_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

In [7]:
def encode_sequences(tokenizer, length, lines):
    """Convert texts to integer sequences and pad them at the end.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Passed to ``pad_sequences`` as ``maxlen``.
        lines: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``
        (padded with 0 values).
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

In [14]:
def encode_output(sequences, vocab_size):
    """One-hot encode every integer sequence.

    Args:
        sequences: 2-D array of integer ids; ``shape[0]`` and ``shape[1]``
            are used for the output shape.
        vocab_size: Number of classes passed to ``to_categorical``.

    Returns:
        Array reshaped to ``(sequences.shape[0], sequences.shape[1],
        vocab_size)``. The result therefore holds
        ``shape[0] * shape[1] * vocab_size`` entries, which grows quickly
        with the vocabulary size.
    """
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    return one_hot.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

In [8]:
# Collect the distinct characters that occur in each corpus. Iterating over a
# sentence string yields its characters, so set.update() adds them all.
input_characters = set()
for line in source_data:
    input_characters.update(line)

target_characters = set()
for line in target_data:
    target_characters.update(line)

In [9]:
# Aliases for the two corpora used by the character-level statistics below.
input_texts, target_texts = source_data, target_data

In [10]:
# Turn the character collections into sorted lists so that every character
# has a stable position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Number of distinct characters per corpus.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)

# Longest text of each corpus, measured with len() on the text itself.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [11]:
# Summary of the character-level statistics computed above.
print('Number of samples:', len(input_texts))
# The "tokens" counted here are the distinct characters of each corpus
# (lengths of input_characters / target_characters), not words.
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
# These maxima are len() of the longest text, i.e. character counts.
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 45172
Number of unique input tokens: 107
Number of unique output tokens: 94
Max sequence length for inputs: 1190
Max sequence length for outputs: 1113


In [12]:
# Map every character to its position in the corresponding sorted list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}


In [None]:
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on ``lines``.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer``.
    """
    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(lines)
    return word_tokenizer

In [10]:
# Word-level vocabulary size and longest encoded sentence for each corpus.
#
# Do not use max_encoder_seq_length / max_decoder_seq_length here: they are
# plain integers (character counts), so calling them raises TypeError. The
# lengths are measured on the tokenizer's own output instead, so they match
# exactly what encode_sequences() later pads to and no sentence is truncated.
sor_tokenizer = create_tokenizer(source_data)
sor_vocab_size = len(sor_tokenizer.word_index) + 1
sor_length = max(len(seq) for seq in sor_tokenizer.texts_to_sequences(source_data))
print('Source Vocabulary Size: %d' % sor_vocab_size)
print('Source Max Length: %d' % (sor_length))

tar_tokenizer = create_tokenizer(target_data)
tar_vocab_size = len(tar_tokenizer.word_index) + 1
tar_length = max(len(seq) for seq in tar_tokenizer.texts_to_sequences(target_data))
print('Target Vocabulary Size: %d' % tar_vocab_size)
print('Target Max Length: %d' % (tar_length))

English Vocabulary Size: 27497
English Max Length: 231
English Vocabulary Size: 24968
English Max Length: 221


In [12]:
from sklearn.model_selection import train_test_split

# Encode each corpus with the tokenizer and maximum length that belong to it.
# The model is built as define_model(sor_vocab_size, tar_vocab_size,
# sor_length, tar_length, ...), so its inputs must be source-encoded and its
# outputs target-encoded.
encoded_source = encode_sequences(sor_tokenizer, sor_length, source_data)
encoded_target = encode_sequences(tar_tokenizer, tar_length, target_data)

# Split the aligned pairs into training and held-out sets. train_test_split
# returns, for every array passed in, its train part followed by its test
# part: (X_train, X_test, y_train, y_test).
# random_state is fixed so that the split is reproducible across runs.
trainX, testX, trainY, testY = train_test_split(
    encoded_source, encoded_target, test_size=0.2, random_state=42)

# One-hot encode the targets, as required by the categorical_crossentropy
# loss used when the model is compiled.
# NOTE(review): each result has shape (samples, tar_length, tar_vocab_size);
# with the sizes printed above this is very large. Consider keeping integer
# targets and training with sparse_categorical_crossentropy instead.
trainY = encode_output(trainY, tar_vocab_size)
testY = encode_output(testY, tar_vocab_size)

In [13]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the sequence-to-sequence network as a ``Sequential`` model.

    Layers, in order: a masked embedding of the input ids, an LSTM whose
    output is repeated ``tar_timesteps`` times, a second LSTM that returns the
    full sequence, and a per-timestep softmax over ``tar_vocab`` classes.

    Args:
        src_vocab: Input dimension of the embedding layer.
        tar_vocab: Number of units of the final softmax layer.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: Repeat count for ``RepeatVector``.
        n_units: Embedding size and number of units of both LSTM layers.

    Returns:
        The model, not yet compiled.
    """
    return Sequential([
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])

In [None]:
# Build the network (source side as input, target side as output, 256 units
# per layer) and compile it for one-hot encoded targets.
model = define_model(
    src_vocab=sor_vocab_size,
    tar_vocab=tar_vocab_size,
    src_timesteps=sor_length,
    tar_timesteps=tar_length,
    n_units=256,
)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train for 30 epochs in batches of 64, evaluating on the held-out pairs
# after every epoch.
model.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    epochs=30,
    batch_size=64,
    verbose=2,
)