# Multilingual Text Classification with MUSE Embeddings (Vector Average)
## What is covered?
1. Load MUSE embeddings
2. Clean and tokenize the data
3. Convert tokenized documents to embedding vectors calculated by averaging
4. Build a simple Keras model
5. Test the model on French and German text

In [12]:
import numpy as np
import pandas as pd
np.random.seed(0)
from util import Utils

## Load Muse Embeddings - English

In [13]:
# Helper object that reads the embedding and review files used throughout the notebook.
util = Utils()
# Raw string literal: the Windows path contains backslashes that must not be
# parsed as escape sequences (the runtime value of the path is unchanged).
word_to_index, index_to_words, word_to_vec_map = util.read_muse_vecs(r'D:\Resources\Muse_Embeddings\wiki.multi.en.vec')

## Data Cleaning and tokenizing 

In [14]:
# Load the English book-review training split into a dataframe.
df = util.read_review_file("amazon-dataset/english/books/train.json")
# Tokenize the loaded reviews; yields the token lists and their labels.
train_set,y = util.tokenize_reviews(df, keep_text=False, stemming=False, keep_punctuation=True)

# Load the English book-review test split into a dataframe.
df2 = util.read_review_file("amazon-dataset/english/books/test.json")
# Tokenize the test reviews with the same options as the training split.
test_set,y2 = util.tokenize_reviews(df2, keep_text=False, stemming=False, keep_punctuation=True)

# Print how many documents and labels were produced for the training split.
print(len(train_set), len(y))
# Show one tokenized review together with its label.
print(train_set[1], y[1])

1997 1997
['boring', 'total', 'lack', 'clarity'] 0


## Building Model - Keras

In [15]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
import os
import tensorflow as tf
# Ask TensorFlow's native logger to suppress its messages.
# NOTE(review): this is assigned after `import tensorflow`; the variable is
# usually set before the import - confirm it actually takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [16]:
#Keras Model
def my_model(input_dim=300, hidden_units=128, num_classes=2):
    """Build a small feed-forward classifier over fixed-size document vectors.

    Architecture: Input(input_dim) -> Dense(hidden_units) -> ReLU
    -> Dense(num_classes) -> softmax.

    Args:
        input_dim: Length of each input vector (defaults to 300, the size
            used by ``docs_to_vector`` in this notebook).
        hidden_units: Width of the single hidden layer.
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras ``Model``; the caller is expected to compile it.
    """
    input_layer = Input(shape=(input_dim,))
    hidden = Dense(hidden_units)(input_layer)
    hidden = Activation("relu")(hidden)
    scores = Dense(num_classes)(hidden)
    probabilities = Activation("softmax")(scores)
    # Keras 2 names these arguments `inputs`/`outputs`; the singular
    # `input`/`output` spellings are the deprecated Keras 1 form.
    return Model(inputs=input_layer, outputs=probabilities)

In [17]:
#convert tokenized docs to vector embeddings by averaging
def docs_to_vector(docs, vec_map, dim=300, unk_key="nokey"):
    """Turn tokenized documents into fixed-size vectors by averaging word vectors.

    Each token is lower-cased and looked up in ``vec_map``; tokens that are
    not present contribute the vector stored under ``unk_key`` instead.

    Args:
        docs: Iterable of documents, each a sequence of string tokens.
        vec_map: Mapping from (lower-case) word to its embedding vector.
        dim: Length of the embedding vectors (defaults to 300).
        unk_key: Key in ``vec_map`` holding the vector used for unknown tokens.

    Returns:
        A float64 array of shape ``(len(docs), dim)``. A document with no
        tokens yields an all-zero row instead of NaNs from a 0/0 division,
        and an empty ``docs`` yields an array of shape ``(0, dim)``.

    Raises:
        KeyError: if an unknown token is met and ``unk_key`` is not in ``vec_map``.
    """
    vectors = []

    for doc in docs:
        vector = np.zeros((dim,), dtype=np.float64)
        for token in doc:
            key = token.lower()  # lower-case once; the map is queried by lower-case word
            if key in vec_map:
                vector += vec_map[key]
            else:
                vector += vec_map[unk_key]
        # Guard against empty documents: dividing the zero vector by zero
        # would fill the row with NaN and poison training/evaluation.
        if len(doc) > 0:
            vector /= len(doc)
        vectors.append(vector)

    if not vectors:
        # Keep the result two-dimensional even when there are no documents.
        return np.zeros((0, dim), dtype=np.float64)
    return np.array(vectors)

#convert labels to one-hot vectors
def convert_to_one_hot(y, C):
    """Return the one-hot encoding of the integer labels in ``y``.

    Args:
        y: numpy array of integer class indices; it is flattened first.
        C: Number of classes, i.e. the width of each one-hot row.

    Returns:
        Array with one row per label, taken from the C x C identity matrix.
    """
    identity = np.eye(C)
    flat_labels = y.reshape(-1)
    # Row i of the identity matrix is exactly the one-hot vector for class i.
    return identity[flat_labels]

In [18]:
# Build the classifier and print its layer-by-layer summary.
model = my_model()
model.summary()
# The network ends in a 2-way softmax and is trained on one-hot labels,
# hence categorical cross-entropy; accuracy is tracked as the metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               38528     
_________________________________________________________________
activation_3 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 38,786
Trainable params: 38,786
Non-trainable params: 0
_________________________________________________________________


  


## Train the model

In [19]:
# Average the word vectors of each training review and one-hot encode the labels.
X_train_vectors = docs_to_vector(train_set, word_to_vec_map)
Y_train_oh = convert_to_one_hot(np.array(y), C=2)

# Same featurisation for the English test split.
X_test_vectors =  docs_to_vector(test_set, word_to_vec_map)
Y_test_oh = convert_to_one_hot(np.array(y2), C=2)

# NOTE(review): the English test split is passed as validation data, so the
# validation metrics are not a held-out estimate if they are used for tuning.
model.fit(X_train_vectors, Y_train_oh, epochs = 4, batch_size = 32, shuffle=True, validation_data=(X_test_vectors, Y_test_oh))

Train on 1997 samples, validate on 1996 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1ab2b13fdd8>

## Evaluating the Model on German and French

In [20]:
#evaluate the model with the provided language text
def evaluate_model(model, lang="de"):
    """Evaluate ``model`` on the book-review test set of another language.

    The reviews are tokenized, averaged into document vectors using that
    language's MUSE embeddings, and scored with ``model.evaluate``.

    Args:
        model: Compiled Keras model whose ``evaluate`` returns (loss, accuracy).
        lang: Language code of the test set; "de" (German) or "fr" (French).

    Returns:
        The accuracy reported by ``model.evaluate``.

    Raises:
        ValueError: if ``lang`` is not one of the supported language codes.
    """
    # language code -> (MUSE embedding file, review test file).
    # Raw strings keep the Windows backslashes from being read as escapes.
    resources = {
        "de": (r'D:\Resources\Muse_Embeddings\wiki.multi.de.vec',
               "amazon-dataset/german/books/test.json"),
        "fr": (r'D:\Resources\Muse_Embeddings\wiki.multi.fr.vec',
               "amazon-dataset/french/books/test.json"),
    }
    # Compare by value (dict lookup), not identity: `lang is "de"` only works
    # when the interpreter happens to reuse the same string object.
    if lang not in resources:
        raise ValueError(
            "Unsupported language {!r}; expected one of {}".format(lang, sorted(resources))
        )
    embedding_path, review_path = resources[lang]

    # Only the word -> vector map is needed; the index lookups are discarded.
    _, _, word_to_vec_map_l = util.read_muse_vecs(embedding_path)
    reviews = util.read_review_file(review_path)

    test_set_l, labels = util.tokenize_reviews(reviews, keep_text=False, stemming=False, keep_punctuation=True)

    X_test_l_vectors = docs_to_vector(test_set_l, word_to_vec_map_l)
    Y_test_l_oh = convert_to_one_hot(np.array(labels), C=2)

    _loss, acc = model.evaluate(x=X_test_l_vectors, y=Y_test_l_oh, batch_size=32, verbose=1)
    return acc
        

## Testing Model on German Text

In [21]:
# Accuracy of the model on the German test reviews.
print(evaluate_model(model, lang="de"))

0.705


## Testing Model on French Text

In [22]:
# Accuracy of the model on the French test reviews.
print(evaluate_model(model, lang="fr"))

0.7245
