In [3]:
#Import the packages that we might need
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow as tf
import string
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [4]:
#Prepare the data and import

#Reading the data
lines = pd.read_table('swe.txt', names=['eng', 'swe','other'])[['eng', 'swe']]

In [5]:
import string

def preprocess_data(df):
    # Lowercase all the characters in all the sentences
    df.eng = df.eng.apply(lambda x: x.lower())
    df.swe = df.swe.apply(lambda x: x.lower())

    # Remove all the quote from the sentences
    df.eng = df.eng.apply(lambda x: x.replace("'", ""))
    df.swe = df.swe.apply(lambda x: x.replace("'", ""))
    df.eng = df.eng.apply(lambda x: x.replace('"', ""))
    df.swe = df.swe.apply(lambda x: x.replace('"', ""))

    # Remove all the punctuations
    exclude = set(string.punctuation)
    df.eng = df.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    df.swe = df.swe.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

    # Remove all the numbers
    df.eng = df.eng.apply(lambda x: ''.join(ch for ch in x if not ch.isdigit()))
    df.swe = df.swe.apply(lambda x: ''.join(ch for ch in x if not ch.isdigit()))

    # Remove all possible extra spaces
    df.eng = df.eng.apply(lambda x: x.strip())
    df.swe = df.swe.apply(lambda x: x.strip())

    # Add the start and end tokens to the Swedish sentences
    df.swe = df.swe.apply(lambda x: 'START_ ' + x + ' _END')

    return df
preprocessed_df = preprocess_data(lines)

In [6]:
lines.sample(10)

Unnamed: 0,eng,swe
10086,i had a good holiday,START_ jag hade en bra ledighet _END
16336,why are you angry with him,START_ varför är du arg på honom _END
20299,could you drop me off at the library,START_ kan du släppa av mig vid biblioteket _END
12639,i like to go to school,START_ jag tycker om att gå i skolan _END
4928,whats your name,START_ vad är ditt namn _END
6028,tom cant be dead,START_ tom kan inte vara död _END
11358,do you have to go now,START_ måste ni gå nu _END
8086,can i get a picture,START_ får jag ta en bild _END
21737,whats that how am i supposed to know,START_ ”vad är det där” ”hur ska jag kunna vet...
22916,tom didnt know that marys house was so close t...,START_ tom visste inte att marys hus var så nä...


In [7]:
# English
all_eng_words = set(word for eng in lines.eng for word in eng.split())

# Swedish
all_swe_words = set(word for swe in lines.swe for word in swe.split())

# Max length of source sequence (English)
max_length_src = max(len(sent.split()) for sent in lines.eng)

# Max length of target sequence (Swedish)
max_length_tar = max(len(sent.split()) for sent in lines.swe)

In [8]:
max_length_tar

69

In [9]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_swe_words))

#Number of unique input words
num_encoder_tokens = len(all_eng_words)

#Number of unique output words
num_decoder_tokens = len(all_swe_words)
num_decoder_tokens += 1 # For zero padding

#Create a dictionary to convert words to numbers
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

#Create a dictionary to convert numbers to words
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [11]:
lines = shuffle(lines)
X, y = lines.eng, lines.swe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)
X_train.shape, X_test.shape

((18613,), (4654,))

In [12]:
#Save train and test data to pickle for easier reproducibility
X_train.to_pickle("X_train.pkl")
X_test.to_pickle("X_test.pkl")

In [13]:
def generate_batch(X=X_train, y=y_train, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype=np.float32)
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype=np.float32)
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype=np.float32)
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                # encoder input seq
                encoder_input_data[i, :len(input_text.split())] = [input_token_index[word] for word in input_text.split()]
                # decoder input seq (without START_)
                decoder_input_data[i, :len(target_text.split())-1] = [target_token_index[word] for word in target_text.split()[1:]]
                # decoder target seq (without END_)
                decoder_target_data[i, :len(target_text.split())-1, :] = tf.keras.utils.to_categorical([target_token_index[word] for word in target_text.split()[:-1]], num_classes=num_decoder_tokens)
            yield([encoder_input_data, decoder_input_data], decoder_target_data)


In [14]:
latent_dim = 50

In [15]:
#Encoder
encoder_inputs = keras.layers.Input(shape=(None,))
enc_emb =  keras.layers.Embedding(num_encoder_tokens+1, latent_dim, mask_zero = True)(encoder_inputs)

encoder_lstm = keras.layers.LSTM(latent_dim,return_sequences=True,return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.layers.Input(shape=(None,))
dec_emb_layer = keras.layers.Embedding(num_decoder_tokens+1, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=encoder_states)

decoder_dense = keras.layers.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])



2023-04-21 11:24:03.791114: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50

In [17]:
model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=batch_size),
                    steps_per_epoch=train_samples//batch_size,
                    epochs=epochs,
                    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
                    validation_steps=val_samples//batch_size)

  model.fit_generator(generator=generate_batch(X_train, y_train, batch_size=batch_size),


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fa9c0035ff0>

In [18]:
#Saving the model weights
model.save_weights('word_weight.h5')

In [28]:
#Encode the input sequence to get the "thought vectors"
encoder_model = keras.models.Model(encoder_inputs, encoder_states)

#Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = keras.layers.Input(shape=(latent_dim,))
decoder_state_input_c = keras.layers.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = keras.models.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)
    

In [29]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop word.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [32]:
test_gen = generate_batch(X_test, y_test, batch_size=1)


In [33]:
import random

# set the number of random samples to generate
num_samples = 10

# generate random indices to select random samples from test set
random_indices = random.sample(range(len(X_test)), num_samples)

for i in range(num_samples):
    k = random_indices[i]
    (input_seq, actual_output), _ = next(test_gen)
    decoded_sentence = decode_sequence(input_seq)
    print('Input Question/English sentence:', X_test[k:k+1].values[0])
    print('Actual Answer/Swedish translation:', y_test[k:k+1].values[0][6:-4])
    print('Predicted Answer/Swedish translation:', decoded_sentence[:-4])
    print('') # add empty line for readability


Input Question/English sentence: i wonder why tom is naked
Actual Answer/Swedish translation:  jag undrar varför tom är naken 
Predicted Answer/Swedish translation:  START_ per sött innan lat innan lat innan lat i

Input Question/English sentence: ill wait a week
Actual Answer/Swedish translation:  jag ska vänta en vecka 
Predicted Answer/Swedish translation:  START_ smakar del smakar del varje del varje del v

Input Question/English sentence: tom opened his suitcase
Actual Answer/Swedish translation:  tom öppnade sin resväska 
Predicted Answer/Swedish translation:  START_ bröt fattig varje ung varje ung varje ung v

Input Question/English sentence: dont let him down
Actual Answer/Swedish translation:  gör honom inte besviken 
Predicted Answer/Swedish translation:  START_ mannen skillnad mannen skillnad föll skil

Input Question/English sentence: you couldve gone
Actual Answer/Swedish translation:  ni kunde ha stuckit 
Predicted Answer/Swedish translation:  START_ nytt fem gånger fem g