In [1]:
import collections
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
import os

def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r", encoding ='utf=8') as f:
        data = f.read()

    return data.split('\n')


from keras.losses import sparse_categorical_crossentropy
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


def _test_model(model, input_shape, output_sequence_length, french_vocab_size):
    if isinstance(model, Sequential):
        model = model.model

    assert model.input_shape == (None, *input_shape[1:]),'Wrong input shape. Found input shape {} using parameter input_shape={}'.format(model.input_shape, input_shape)

    assert model.output_shape == (None, output_sequence_length, french_vocab_size),'Wrong output shape. Found output shape {} using parameters output_sequence_length={} and french_vocab_size={}'.format(model.output_shape, output_sequence_length, french_vocab_size)

    assert len(model.loss_functions) > 0,'No loss function set.  Apply the `compile` function to the model.'

    assert sparse_categorical_crossentropy in model.loss_functions,'Not using `sparse_categorical_crossentropy` function for loss.'


def test_tokenize(tokenize):
    sentences = [
        'আমি এই উপন্যাস আগেও পড়েছি।',
        'টম খুব খোলামেলা মানুষ।',
        'তুমি কি কখনো হেলিকপ্টারে বসেছো?']
    tokenized_sentences, tokenizer = tokenize(sentences)
    assert tokenized_sentences == tokenizer.texts_to_sequences(sentences),\
        'Tokenizer returned and doesn\'t generate the same sentences as the tokenized sentences returned. '


def test_pad(pad):
    tokens = [
        [i for i in range(4)],
        [i for i in range(6)],
        [i for i in range(3)]]
    padded_tokens = pad(tokens)
    padding_id = padded_tokens[0][-1]
    true_padded_tokens = np.array([
        [i for i in range(4)] + [padding_id]*2,
        [i for i in range(6)],
        [i for i in range(3)] + [padding_id]*3])
    assert isinstance(padded_tokens, np.ndarray),\
        'Pad returned the wrong type.  Found {} type, expected numpy array type.'
    assert np.all(padded_tokens == true_padded_tokens), 'Pad returned the wrong results.'

    padded_tokens_using_length = pad(tokens, 9)
    assert np.all(padded_tokens_using_length == np.concatenate((true_padded_tokens, np.full((3, 3), padding_id)), axis=1)),\
        'Using length argument return incorrect results'


def test_simple_model(simple_model):
    input_shape = (80000, 17, 1)
    output_sequence_length = 17
    english_vocab_size = 12201
    french_vocab_size = 14157

    model = simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size)
    _test_model(model, input_shape, output_sequence_length, french_vocab_size)


def test_embed_model(embed_model):
    input_shape = (200000, 17)
    output_sequence_length = 17
    english_vocab_size = 12201
    french_vocab_size = 14157

    model = embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size)
    _test_model(model, input_shape, output_sequence_length, french_vocab_size)


def test_encdec_model(encdec_model):
    input_shape = (200000, 17, 1)
    output_sequence_length = 17
    english_vocab_size = 12201
    french_vocab_size = 14157

    model = encdec_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size)
    _test_model(model, input_shape, output_sequence_length, french_vocab_size)


def test_bd_model(bd_model):
    input_shape = (200000, 17, 1)
    output_sequence_length = 17
    english_vocab_size = 12201
    french_vocab_size = 14157

    model = bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size)
    _test_model(model, input_shape, output_sequence_length, french_vocab_size)


def test_model_final(model_final):
    input_shape = (200000, 17)
    output_sequence_length = 17
    english_vocab_size = 12201
    french_vocab_size = 14157

    model = model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size)
    _test_model(model, input_shape, output_sequence_length, french_vocab_size)

In [5]:
english_sentences = load_data('/content/drive/My Drive/NLP_csv/80kban.txt')
french_sentences = load_data('/content/drive/My Drive/NLP_csv/80kger.txt')
print('Dataset Loaded')

Dataset Loaded


In [6]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  যান.
small_vocab_fr Line 1:  Geh.
small_vocab_en Line 2:  নমস্কার!
small_vocab_fr Line 2:  Hallo!


In [7]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])
print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

335708 English words.
14101 unique English words.
10 Most common words in the English dataset:
"আমি" "টম" "এটা" "কি" "তুমি" "আমার" "না।" "তোমার" "সে" "একটা"

356660 French words.
23231 unique French words.
10 Most common words in the French dataset:
"Tom" "Ich" "ist" "Sie" "nicht" "das" "du" "Das" "hat" "Er"


In [8]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk
text_sentences = [
    'আমি এই উপন্যাস আগেও পড়েছি।',
    'টম খুব খোলামেলা মানুষ।',
    'তুমি কি কখনো হেলিকপ্টারে বসেছো?']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'আমি': 1, 'এই': 2, 'উপন্যাস': 3, 'আগেও': 4, 'পড়েছি।': 5, 'টম': 6, 'খুব': 7, 'খোলামেলা': 8, 'মানুষ।': 9, 'তুমি': 10, 'কি': 11, 'কখনো': 12, 'হেলিকপ্টারে': 13, 'বসেছো': 14}

Sequence 1 in x
  Input:  আমি এই উপন্যাস আগেও পড়েছি।
  Output: [1, 2, 3, 4, 5]
Sequence 2 in x
  Input:  টম খুব খোলামেলা মানুষ।
  Output: [6, 7, 8, 9]
Sequence 3 in x
  Input:  তুমি কি কখনো হেলিকপ্টারে বসেছো?
  Output: [10, 11, 12, 13, 14]


In [9]:
# import project_tests as tests
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')
test_pad(pad)
# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 3 4 5]
  Output: [1 2 3 4 5]
Sequence 2 in x
  Input:  [6 7 8 9]
  Output: [6 7 8 9 0]
Sequence 3 in x
  Input:  [10 11 12 13 14]
  Output: [10 11 12 13 14]


In [10]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 17
Max French sentence length: 17
English vocabulary size: 12201
French vocabulary size: 14157


In [11]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [12]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-2
    input_seq = Input(input_shape[1:])
    rnn = GRU(128, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model
# test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
# Train the neural network
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size+1,
    french_vocab_size+1)
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=100, epochs=30, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
ist ist <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [13]:
def final_predictions(x, y, x_tk, y_tk):
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    sentence = ''
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    debug1 = sentence
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    debug2 = sentence
    sentences = np.array([sentence[0], x[0]])
    debug3 = sentences
    predictions = simple_rnn_model.predict(sentences, len(sentences))
    debug4 = predictions
    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    print('ওহে')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(x)] for x in y[1209]]))
    a = []
    for i in range(0,len(french_sentences)):
        debug5 = " ".join([y_id_to_word[np.max(x)] for x in y[i]])
        if i  == 1:
            debug4 = debug5
        a.append(debug5)        
    from pandas import DataFrame
    df = DataFrame(a,columns=["predicted string"])
    df["predicted string"]= df["predicted string"].str.replace("<PAD>", "", case = False) 
    df["actual language"] = french_sentences

    
    return debug1, debug2, debug3, debug4, debug5, a, df
debug1, debug2, debug3, debug4, debug5,  a, df = final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)
#df.to_csv("jekhane khushi save kore ne sagol.csv")

Sample 1:
ich bin’s <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
ওহে
Sample 2:
ist ist <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
prüfen sie das nach <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [14]:
df_test = df

In [15]:
remove_characters = ["?", ".","!",","]

for c in remove_characters:
    df_test["actual language"] =  df_test["actual language"].str.replace(c,"")

df_test["actual language"] = df_test["actual language"] .str.lower()

In [16]:
df_test

Unnamed: 0,predicted string,actual language
0,geh,geh
1,hallo,hallo
2,grüß gott,grüß gott
3,lauf,lauf
4,lauf,lauf
...,...,...
79995,ich habe nicht so viel mut wie ihr,ich habe nicht so viel mut wie ihr
79996,das habe ich nicht vor,das habe ich nicht vor
79997,ich weiss toms nummer nicht,ich weiss toms nummer nicht
79998,ich weiß nicht wie ich das gemacht habe,ich weiß nicht wie ich das gemacht habe


In [17]:
col_1 = df['predicted string'].tolist()
col_2 = df["actual language"].tolist()


from nltk.translate.bleu_score import corpus_bleu
#reference = [['this', 'is', 'a', 'test'], ['this', 'is' 'test']]
#candidate = ['this', 'is', 'a', 'test']
score1 = corpus_bleu(col_2, col_1, weights=(1, 0, 0, 0))
score2 = corpus_bleu(col_2, col_1, weights=(0.5, 0.5, 0, 0))
score3 = corpus_bleu(col_2, col_1, weights=(0.33, 0.33, 0.33, 0))
score4 = corpus_bleu(col_2, col_1, weights=(0.25, 0.25, 0.25, 0.25))
score21 = corpus_bleu(col_2, col_1, weights=(0.5, 0.5, 0, 0))
score31 = corpus_bleu(col_2, col_1, weights=(0.33, 0.33, 0.33, 0))
score41 = corpus_bleu(col_2, col_1, weights=(0.25, 0.25, 0.25, 0.25))
print(score1)
print(score2)
print(score3)
print(score4)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.3561822299031535
0.5968100450756116
0.7112984167321372
0.7725348180345087


In [18]:
score11 = corpus_bleu(col_2, col_1, weights=(1, 0, 0, 0))
score21 = corpus_bleu(col_2, col_1, weights=(0, 1, 0, 0))
score31 = corpus_bleu(col_2, col_1, weights=(0, 0, 1, 0))
score41 = corpus_bleu(col_2, col_1, weights=(0, 0, 0, 1))
print(score11)
print(score21)
print(score31)
print(score41)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.3561822299031535
1.0
1.0
1.0
