In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors, FastText
import gensim
import time
import random
import matplotlib.pyplot as plt
import string
import nltk
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_wc_embd import get_dicts_generator

Using TensorFlow backend.


In [2]:
import os
from platform import python_version

# Expose GPUs 0 and 1 to CUDA. A plain Python assignment
# (`CUDA_VISIBLE_DEVICES = 0,1`) only creates a tuple; the setting has to be
# placed in the process environment to have any effect.
# NOTE(review): the variable is read when CUDA is first initialised, so this
# cell must run before any TensorFlow device is created - confirm it takes
# effect in this environment, where TensorFlow is imported one cell earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Record the interpreter version used for this run.
print(python_version())

3.6.10


In [3]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Display the GPU device names reported for this machine.
get_available_gpus()

['/device:GPU:0']

In [4]:
# Read the raw train/test sentences, one sentence per line (each line keeps
# its trailing newline). `with` guarantees the file handles are closed.
with open('F:/Datasets/Language Identification/x_train.txt', 'r', encoding='utf-8') as train_file:
    sentences1 = train_file.readlines()
with open('F:/Datasets/Language Identification/x_test.txt', 'r', encoding='utf-8') as test_file:
    sentences2 = test_file.readlines()

In [5]:
# Wrap each list of raw lines in a single-column DataFrame named 'Sentence'.
sentences1, sentences2 = (
    pd.DataFrame(raw_lines, columns=['Sentence'])
    for raw_lines in (sentences1, sentences2)
)

In [6]:
# Stack the train and test sentences into one frame (row-wise).
sentences = pd.concat((sentences1, sentences2), axis=0)

In [7]:
# Read the raw train/test labels, one label per line (each line keeps its
# trailing newline). `with` guarantees the file handles are closed.
with open('F:/Datasets/Language Identification/y_train.txt', 'r', encoding='utf-8') as train_file:
    labels1 = train_file.readlines()
with open('F:/Datasets/Language Identification/y_test.txt', 'r', encoding='utf-8') as test_file:
    labels2 = test_file.readlines()

In [8]:
# Wrap each list of raw label lines in a single-column DataFrame named 'Language'.
labels1, labels2 = (
    pd.DataFrame(raw_lines, columns=['Language'])
    for raw_lines in (labels1, labels2)
)

In [9]:
# Stack the train and test labels into one frame (row-wise).
labels = pd.concat((labels1, labels2), axis=0)

In [10]:
# Pair every sentence with its label by position: both frames get a fresh
# 0..n-1 index and are then joined on that index.
language_dataset = pd.merge(
    left=sentences.reset_index(drop=True),
    right=labels.reset_index(drop=True),
    left_index=True,
    right_index=True,
)

In [11]:
# Sanity check: (rows, columns) of the merged sentence/label frame.
language_dataset.shape

(235000, 2)

In [12]:
# Languages kept for this experiment. The labels come from readlines(), so
# every code still carries its trailing newline.
unique_languages = ['eng\n', 'spa\n', 'deu\n', 'ita\n', 'fra\n', 'rus\n', 'ara\n']

# .copy() detaches the filtered rows from the original frame so that adding a
# column below writes to an independent frame rather than a slice.
language_dataset = language_dataset[language_dataset['Language'].isin(unique_languages)].copy()

# Tokenize lower-cased text: SequenceIterator lower-cases every sentence
# before looking words up in the Word2Vec vocabulary, so the vocabulary must
# be built from lower-cased tokens or capitalised words would never be found.
language_dataset['Sentence Tokenized'] = (
    language_dataset['Sentence'].str.lower().apply(nltk.word_tokenize)
)

In [13]:
# Word2Vec expects an iterable of token lists; note that this rebinds
# `sentences` (previously a DataFrame) to a plain list.
sentences = list(language_dataset['Sentence Tokenized'])

In [14]:
# Train the Word2Vec embeddings on the tokenized sentences and time the run.
time1 = time.time()
model = Word2Vec(
    sentences,
    size=300,      # embedding dimensionality
    window=5,
    min_count=1,   # keep every token, even ones seen once
    negative=5,
    iter=100,
)
time2 = time.time()

In [15]:
# Wall-clock training time in seconds.
time2-time1

86.25551962852478

In [16]:
# Export the trained word vectors (model.wv) in word2vec format to the
# current working directory.
model.wv.save_word2vec_format('MultiLingualModel.model')


In [17]:
# Sorted list of the distinct labels remaining after filtering.
unique_languages = np.unique(language_dataset['Language']).tolist()

In [18]:
# The token lists were only needed to train Word2Vec. errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op instead of a KeyError.
language_dataset = language_dataset.drop(columns=['Sentence Tokenized'], errors='ignore')

In [19]:
# The whole filtered dataset is used as a single partition. Earlier
# experiments sliced the data by groups of languages
# (e.g. unique_languages[0:5], [50:100], ...); those slices are disabled.
language_dataset_part1 = language_dataset

In [20]:
def word2token(word):
    """Return the Word2Vec vocabulary index of `word`, or 0 if it is unknown.

    Index 0 therefore stands both for out-of-vocabulary words and for
    whichever word actually holds index 0 (per the original author's note,
    the most frequent word). 0 is also the padding value used when the
    sequences are padded, so both end up treated as padding downstream.
    """
    vocab = model.wv.vocab
    if word in vocab:
        return vocab[word].index
    return 0


In [21]:
# Create an iterator that turns dataset rows into token-index sequences
# in the format needed for LSTM training.

# Sequences will be padded or truncated to this length
MAX_SEQUENCE_LENGTH = 200

# Intended minimum number of samples a category needs in order to be kept.
# NOTE(review): the value is passed to SequenceIterator, but no filtering on
# it is performed anywhere in this notebook, so no category is dropped.
DROP_THRESHOLD = 10000

class SequenceIterator:
    """Iterate over a dataset, yielding one (token_ids, language) pair per row.

    Parameters
    ----------
    dataset : object with `Sentence` and `Language` attributes
        Typically a DataFrame; the two columns are iterated in parallel.
    drop_threshold : int
        Stored for reference only; no category filtering is performed.
    seq_length : int
        Maximum number of tokens kept per sentence.

    Attributes
    ----------
    categories, ccount : numpy arrays
        Sorted distinct labels and their counts, from np.unique.
    """

    def __init__(self, dataset, drop_threshold, seq_length):
        self.dataset = dataset
        self.drop_threshold = drop_threshold

        # Translation table that deletes ASCII punctuation and the en dash.
        self.translator = str.maketrans('', '', string.punctuation + '–')
        self.categories, self.ccount = np.unique(dataset.Language, return_counts=True)

        self.seq_length = seq_length

    def __iter__(self):
        for sent, lang in zip(self.dataset.Sentence, self.dataset.Language):
            # Lower-case, then strip all punctuation.
            cleaned = sent.lower().translate(self.translator)

            # split() with no argument splits on any whitespace run, so the
            # trailing newline left by readlines() no longer sticks to the
            # last word (which made it out-of-vocabulary), and no empty
            # tokens are produced. Truncation now counts real tokens only.
            tokens = cleaned.split()[:self.seq_length]

            # dtype=int keeps an integer array even for an empty sentence.
            words = np.array([word2token(w) for w in tokens], dtype=int)

            yield (words, lang)

sequences = SequenceIterator(language_dataset_part1, DROP_THRESHOLD, MAX_SEQUENCE_LENGTH)

# Label string -> integer class id, following the order of sequences.categories.
cat_dict = {category: class_id for class_id, category in enumerate(sequences.categories)}

# Collect the token-id sequence and the class id of every row.
set_x, set_y = [], []
for token_ids, language in sequences:
    set_x.append(token_ids)
    set_y.append(cat_dict[language])

# Left-pad (or truncate) every sequence to MAX_SEQUENCE_LENGTH using 0.
set_x = pad_sequences(set_x, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', value=0)
set_y = np.array(set_y)

print(set_x.shape)
print(set_y.shape)

(7000, 200)
(7000,)


In [22]:
# Fraction of the whole set that is held out for validation.
VALID_PER = 0.05

total_samples = set_x.shape[0]
n_val = int(VALID_PER * total_samples)
n_train = total_samples - n_val

# One random permutation of all row positions; the first n_train positions
# form the training set and the remainder the validation set.
random_i = random.sample(range(total_samples), total_samples)
train_positions = random_i[:n_train]
val_positions = random_i[n_train:n_train + n_val]

train_x, train_y = set_x[train_positions], set_y[train_positions]
val_x, val_y = set_x[val_positions], set_y[val_positions]

print("Train Shapes - X: {} - Y: {}".format(train_x.shape, train_y.shape))
print("Val Shapes - X: {} - Y: {}".format(val_x.shape, val_y.shape))

# Number of output classes, taken from the labels present in the training split.
categories, ccount = np.unique(train_y, return_counts=True)
n_categories = len(categories)

Train Shapes - X: (6650, 200) - Y: (6650,)
Val Shapes - X: (350, 200) - Y: (350,)


In [23]:
# Embedding matrix learned by Word2Vec: one row per vocabulary entry.
w2v_weights = model.wv.vectors
vocab_size, embedding_size = w2v_weights.shape

lstm_model = Sequential()

# Frozen Keras Embedding layer initialised with the Word2Vec weights. The
# dimensions are derived from the weight matrix and from MAX_SEQUENCE_LENGTH
# rather than repeated as literals, so they cannot drift out of sync.
# NOTE(review): mask_zero=True masks token id 0, which word2token also returns
# for unknown words and which is a real vocabulary row - confirm this is intended.
lstm_model.add(Embedding(input_dim=vocab_size,
                         output_dim=embedding_size,
                         weights=[w2v_weights],
                         input_length=MAX_SEQUENCE_LENGTH,
                         mask_zero=True,
                         trainable=False))

lstm_model.add(Bidirectional(LSTM(100)))
lstm_model.add(Dense(n_categories, activation='softmax'))
# Integer class ids are used as targets, hence the sparse loss.
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = lstm_model.fit(train_x, train_y, epochs=5, batch_size=64,
                         validation_data=(val_x, val_y), verbose=1)

Train on 6650 samples, validate on 350 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
def make_lstm_format_output(X):
    """Convert raw sentences into the padded token-id matrix the LSTM expects.

    Parameters
    ----------
    X : str or list of str
        One sentence, or a list of sentences. A bare string is treated as a
        single-sentence list.

    Returns
    -------
    The result of pad_sequences: one row per sentence, left-padded with 0
    (or truncated) to MAX_SEQUENCE_LENGTH.
    """
    if isinstance(X, str):
        X = [X]

    frame = pd.DataFrame(X, columns=['Sentence'])
    # SequenceIterator needs a Language column; the value is a placeholder
    # and the labels it yields are discarded.
    frame['Language'] = "X"

    sequence1 = SequenceIterator(frame, DROP_THRESHOLD, MAX_SEQUENCE_LENGTH)
    token_rows = [token_ids for token_ids, _ in sequence1]

    # Padding sequences with 0.
    return pad_sequences(token_rows, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', value=0)
    

In [25]:
def predict_language(X, lstm_model):
    """Print the predicted language of a single sentence.

    Parameters
    ----------
    X : str
        The sentence to classify.
    lstm_model : object
        Either the trained model itself (anything with `.predict`) or an
        object exposing it as `.model`, such as the History returned by fit().

    The class id is translated back to its label through the global
    `cat_dict`, then to a display name. Labels without a display name are
    printed as their stripped code.
    """
    display_names = {
        'ara': 'Arabic', 'deu': 'Deutsche', 'eng': 'English', 'fra': 'French',
        'ita': 'Italian', 'rus': 'Russian', 'spa': 'Spanish',
    }

    # Accept both a History-like wrapper and the bare model.
    net = getattr(lstm_model, 'model', lstm_model)

    batch = make_lstm_format_output([X])
    output = net.predict(batch)
    ind = int(np.argmax(output))

    # Invert label -> id so the name lookup does not depend on a hard-coded
    # list staying in the same order as the class ids.
    index_to_code = {class_id: code for code, class_id in cat_dict.items()}
    code = index_to_code[ind].strip()
    print(display_names.get(code, code))

In [26]:
# Smoke-test the classifier on one short sentence per language. `history`
# is passed because predict_language reaches the model through `.model`.
sample_sentences = [
    'Darf ich mal vorbei?',
    'Hey Pal. What is up?',
    'Buenos días, Estela!',
    'Parlez-vous anglais?',
    'Добрый день',
    'السلام عليكم',
]
for sample in sample_sentences:
    predict_language(sample, history)

Deutsche
English
Spanish
French
Russian
Arabic
