# Multi-lingual text Classification with Muse Emebedding (LSTM)
## What is covered?
1. Load Muse Embiddings
2. Data Cleaning and tokenize
3. Convert tokenized documents to the embedding vectors
3. Building Simple Keras Model
4. Test Model on french and German text

In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
from util import Utils

## Load Muse Embeddings - English

In [2]:
util = Utils()
word_to_index, index_to_words, word_to_vec_map = util.read_muse_vecs('D:\Resources\Muse_Embeddings\wiki.multi.en.vec')

In [3]:
#loading training dataset into a dataframe
df = util.read_review_file("amazon-dataset/english/books/train.json")
#tokenize loaded dataframe
train_set,y = util.tokenize_reviews(df, keep_text=False, stemming=False, keep_punctuation=True)

#loading testing dataset into a dataframe
df2 = util.read_review_file("amazon-dataset/english/books/test.json")
#tokenize loaded dataframe
test_set,y2 = util.tokenize_reviews(df2, keep_text=False, stemming=False, keep_punctuation=True)

print(len(train_set), len(y))
print(train_set[1], y[1])

1997 1997
['boring', 'total', 'lack', 'clarity'] 0


## Building Model - Keras (LSTM)

In [4]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import os
import tensorflow as tf
#disable warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

Using TensorFlow backend.


In [5]:
#Keras Model
def my_model(maxLen):
    
    input_layer = Input(shape = (maxLen,300))
    
    X = LSTM(128, return_sequences=True)(input_layer)
    X = Dropout(0.4)(X)
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.4)(X)
    X = Dense(2)(X)
    X = Activation("softmax")(X)
    
    return Model(input=input_layer, output=X)

In [6]:
#get max length doc for padding purpose
def get_maxlength_doc(docs):
    maxLen = max([len(doc) for doc in docs])
    return maxLen

def docs_to_vector(docs, vec_map, maxLen):
    vectors = []
    for doc in docs:
        vector = []
        for token in doc:
            if token.lower() in vec_map:
                vector.append(vec_map[token.lower()])
            else:
                vector.append(vec_map["nokey"])
        #padd sequence for max length
        pad = maxLen - len(vector)
        if pad > 0:
            padv = np.zeros((300,),dtype=np.float64)
            for i in range(pad):
                vector.append(padv)
        vectors.append(vector)
        
    return np.array(vectors)

#convert lables to one-hot vectors
def convert_to_one_hot(y, C):
    Y = np.eye(C)[y.reshape(-1)]
    return Y

maxLen = get_maxlength_doc(test_set + train_set)


In [7]:
model = my_model(maxLen)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

W0714 11:11:34.847043 14912 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 11:11:34.864484 14912 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 11:11:34.868470 14912 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 11:11:35.095888 14912 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 26, 300)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 26, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 26, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total para

## Train the model

In [8]:
X_train_vectors = docs_to_vector(train_set, word_to_vec_map, maxLen)
Y_train_oh = convert_to_one_hot(np.array(y), C=2)

X_test_vectors =  docs_to_vector(test_set, word_to_vec_map, maxLen)
Y_test_oh = convert_to_one_hot(np.array(y2), C=2)

model.fit(X_train_vectors, Y_train_oh, epochs = 4, batch_size = 32, shuffle=True, validation_data=(X_test_vectors, Y_test_oh))

W0714 11:11:44.295188 14912 deprecation.py:323] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 1997 samples, validate on 1996 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x23120d34c50>

## Evaluating the Model on Geraman and French

In [14]:
#evaluate the model with the provided language text
def evaluate_model(model, lang="de"):
    word_to_index_l, index_to_words_l, word_to_vec_map_l = 0,0,0
    df3 = 0
    if lang is "de":
        word_to_index_l, index_to_words_l, word_to_vec_map_l = util.read_muse_vecs('D:\Resources\Muse_Embeddings\wiki.multi.de.vec')
        df3 = util.read_review_file("amazon-dataset/german/books/test.json")
    elif lang is "fr":
        word_to_index_l, index_to_words_l, word_to_vec_map_l = util.read_muse_vecs('D:\Resources\Muse_Embeddings\wiki.multi.fr.vec')
        df3 = util.read_review_file("amazon-dataset/french/books/test.json")
    
    test_set_l,y3 = util.tokenize_reviews(df3, keep_text=False, stemming=False, keep_punctuation=True)
    
    X_test_l_vectors =  docs_to_vector(test_set_l, word_to_vec_map_l, maxLen)
    Y_test_l_oh = convert_to_one_hot(np.array(y3), C=2)
    print(Y_test_l_oh)
    
    loss,acc = model.evaluate(x=X_test_l_vectors, y=Y_test_l_oh, batch_size=32, verbose=1)
    return acc
        

## Testing Model on German Text

In [12]:
print(evaluate_model(model, lang="de"))

0.7015


## Testing Model on French Text

In [15]:
print(evaluate_model(model, lang="fr"))

[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


ValueError: Error when checking input: expected input_1 to have 3 dimensions, but got array with shape (2000, 1)