# Multilingual Text Classification with MUSE Embeddings
## What is covered?
1. Load MUSE Embeddings
2. Data Cleaning
3. Model building

In [2]:
import numpy as np
import pandas as pd
np.random.seed(0)
from util import Utils

## Load Muse Embeddings

In [3]:
util = Utils()
# Raw string literal: the Windows path contains backslash sequences (\R, \M, \w)
# that are not valid escapes in a normal string literal. The resulting path is
# character-for-character the same as before.
word_to_index, index_to_words, word_to_vec_map = util.read_muse_vecs(r'D:\Resources\Muse_Embeddings\wiki.multi.en.vec')

## Data Cleaning and tokenizing 

In [4]:
# English training split: read the review file, then tokenize it into
# (documents, labels). Each element of train_set is a list of tokens and y
# holds one label per document (see the printed sample below).
df = util.read_review_file("amazon-dataset/english/books/train.json")
train_set,y = util.tokenize_reviews(df, keep_text=False, stemming=False, keep_punctuation=True)

# English test split, processed with the same tokenizer options.
df2 = util.read_review_file("amazon-dataset/english/books/test.json")
test_set,y2 = util.tokenize_reviews(df2, keep_text=False, stemming=False, keep_punctuation=True)

# Sanity check: documents and labels must have equal length; show one sample.
print(len(train_set), len(y))
print(train_set[1], y[1])

print(len(test_set), len(y2))
print(test_set[1], y2[1])

1997 1997
['boring', 'total', 'lack', 'clarity'] 0
1996 1996
['refinement', 'needed'] 0


## Model building - Keras LSTM

In [5]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

Using TensorFlow backend.


In [6]:
def sentences_to_indices(docs, word_to_index, max_len):
    """Convert tokenized documents into a zero-padded matrix of vocabulary indices.

    Every token is lower-cased and looked up in ``word_to_index``. Tokens that
    are not in the vocabulary are mapped to the index stored under ``"nokey"``.

    Args:
        docs: sequence of documents, each a list of string tokens.
        word_to_index: dict mapping a word to its integer index. It must
            contain the key ``"nokey"`` if any token can be out of vocabulary.
        max_len: number of columns of the result. Shorter documents are
            right-padded with 0; longer documents are truncated to their
            first ``max_len`` tokens.

    Returns:
        Float ndarray of shape ``(len(docs), max_len)``.

    Raises:
        KeyError: if an out-of-vocabulary token is met and ``word_to_index``
            has no ``"nokey"`` entry.
    """
    m = len(docs)
    X_indices = np.zeros((m, max_len))
    for i, doc in enumerate(docs):
        # Slice to max_len so a document longer than the padded width is
        # truncated instead of writing past the last column (IndexError).
        for j, w in enumerate(doc[:max_len]):
            key = w.lower()
            if key in word_to_index:
                X_indices[i, j] = word_to_index[key]
            else:
                X_indices[i, j] = word_to_index["nokey"]

    return X_indices

In [7]:
# Smoke test: two short documents padded to width 3. The second document has
# only two tokens, so its last column stays 0.
docs = [["I", "love", "football"], ["hello", "there"]]
indices = sentences_to_indices(docs, word_to_index, max_len=3)
print(indices)

[[ 82344. 104225.  63850.]
 [ 77419. 175887.      0.]]


In [8]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras ``Embedding`` layer filled with pre-trained vectors.

    Args:
        word_to_vec_map: dict mapping a word to its 300-dimensional vector.
        word_to_index: dict mapping a word to its row index in the matrix.

    Returns:
        A non-trainable ``Embedding`` layer whose weight matrix has
        ``len(word_to_index) + 1`` rows and 300 columns. Words whose vector
        does not have length 300 are printed and left as all-zero rows.
    """
    emb_dim = 300
    vocab_len = len(word_to_index) + 1
    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        vec = word_to_vec_map[word]
        if len(vec) == emb_dim:
            emb_matrix[index, :] = vec
        else:
            # Malformed vector: report the word and keep its row at zero.
            print(word)

    # The layer has to be built before its weights can be assigned.
    embedding_layer = Embedding(
        input_dim=vocab_len,
        output_dim=emb_dim,
        trainable=False,
    )
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [9]:
def my_model(input_shape, word_to_vec_map, word_to_index):
    """Build the two-layer LSTM classifier on top of the frozen embedding layer.

    Architecture: int32 index input -> pre-trained embedding -> LSTM(128,
    returning sequences) -> Dropout(0.4) -> LSTM(128) -> Dropout(0.4) ->
    Dense(2) -> softmax.

    Args:
        input_shape: shape tuple passed to ``Input`` (the padded sequence
            length, e.g. ``(maxLen,)``).
        word_to_vec_map: dict mapping a word to its embedding vector.
        word_to_index: dict mapping a word to its embedding row index.

    Returns:
        An uncompiled Keras ``Model``.
    """
    sentence_indices = Input(input_shape, dtype = 'int32')
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.4)(X)
    X = LSTM(128, return_sequences=False)(X)
    X = Dropout(0.4)(X)
    X = Dense(2)(X)
    X = Activation("softmax")(X)

    # `inputs`/`outputs` are the Keras 2 keyword names; the singular
    # `input`/`output` spellings are legacy aliases.
    return Model(inputs=sentence_indices, outputs=X)

In [9]:
def docs_to_vector(docs, vec_map, dim=300):
    """Represent each tokenized document by the mean of its word vectors.

    Tokens are lower-cased before lookup. A token missing from ``vec_map``
    contributes the vector stored under ``"nokey"``.

    Args:
        docs: sequence of documents, each a list of string tokens.
        vec_map: dict mapping a word to a vector of length ``dim``. It must
            contain ``"nokey"`` if any token can be out of vocabulary.
        dim: dimensionality of the vectors (defaults to 300).

    Returns:
        ndarray with one float64 row of length ``dim`` per document. An empty
        document yields an all-zero row.

    Raises:
        KeyError: if an out-of-vocabulary token is met and ``vec_map`` has no
            ``"nokey"`` entry.
    """
    vectors = []

    for doc in docs:
        vector = np.zeros((dim,), dtype=np.float64)
        for token in doc:
            key = token.lower()
            if key in vec_map:
                vector += vec_map[key]
            else:
                vector += vec_map["nokey"]
        # An empty document has nothing to average: dividing by len(doc) == 0
        # would turn the whole row into NaN, so the zero vector is kept.
        if len(doc) > 0:
            vector /= len(doc)
        vectors.append(vector)
    return np.array(vectors)
                

In [10]:
def my_model2():
    """Build the dense classifier that consumes one 300-d vector per document.

    Architecture: Input(300) -> Dense(128) -> ReLU -> Dense(2) -> softmax.

    Returns:
        An uncompiled Keras ``Model``.
    """
    input_layer = Input(shape=(300,))
    X = Dense(128)(input_layer)
    X = Activation("relu")(X)
    X = Dense(2)(X)
    X = Activation("softmax")(X)

    # `inputs`/`outputs` are the Keras 2 keyword names; the singular
    # `input`/`output` spellings are legacy aliases.
    return Model(inputs=input_layer, outputs=X)
    

In [11]:
# Token count of the longest document in each split.
maxLen = max(len(doc) for doc in train_set)
print(maxLen)

maxLen2 = max(len(doc) for doc in test_set)
print(maxLen2)

# Dense model: build it, print the layer table, then compile it.
model2 = my_model2()
model2.summary()
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

W0714 09:52:55.508239 15384 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 09:52:55.527189 15384 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 09:52:55.532203 15384 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

  
W0714 09:52:55.563162 15384 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0714 09:52:5

26
18
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               38528     
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 38,786
Trainable params: 38,786
Non-trainable params: 0
_________________________________________________________________


In [12]:
def convert_to_one_hot(y, C):
    """One-hot encode integer class labels.

    Args:
        y: array-like of integer class ids (list, tuple or ndarray of any
            shape); it is flattened before encoding.
        C: number of classes, i.e. the width of each one-hot row.

    Returns:
        Float ndarray with one row of length ``C`` per label.
    """
    # np.asarray lets callers pass plain lists or tuples; calling .reshape
    # directly on `y` required it to already be an ndarray.
    labels = np.asarray(y).reshape(-1)
    # Row k of the identity matrix is the one-hot vector for class k.
    Y = np.eye(C)[labels]
    return Y


# Features for the dense model (model2): one array row per document, produced
# by docs_to_vector, plus one-hot labels for the two classes.
# NOTE(review): despite the `_indices` suffix these hold the output of
# docs_to_vector, not vocabulary indices.
X_train_indices = docs_to_vector(train_set, word_to_vec_map)
Y_train_oh = convert_to_one_hot(np.array(y), C=2)

X_test_indices =  docs_to_vector(test_set, word_to_vec_map)
Y_test_oh = convert_to_one_hot(np.array(y2), C=2)

# Train on the English training split; the English test split is passed as
# validation data.
model2.fit(X_train_indices, Y_train_oh, epochs = 4, batch_size = 32, shuffle=True, validation_data=(X_test_indices, Y_test_oh))

W0714 09:53:00.106420 15384 deprecation.py:323] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0714 09:53:00.148077 15384 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 1997 samples, validate on 1996 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1ceaf159860>

In [11]:
# LSTM model: the input length is the longest training document (maxLen).
model = my_model((maxLen,), word_to_vec_map, word_to_index)
model.summary()

W0714 01:30:33.551189 10000 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0714 01:30:33.567146 10000 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0714 01:30:34.072847 10000 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0714 01:30:34.080825 10000 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session in

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 26)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 26, 300)           59997000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 26, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 26, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
__________

  


In [12]:
# Same loss, optimizer and metric as used for model2.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

W0714 01:30:36.947109 10000 deprecation_wrapper.py:119] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [13]:
# NOTE(review): this function is also defined in an earlier cell of this
# notebook; this definition shadows that one when the cells run in order.
def convert_to_one_hot(y, C):
    """One-hot encode integer class labels.

    Args:
        y: array-like of integer class ids (list, tuple or ndarray of any
            shape); it is flattened before encoding.
        C: number of classes, i.e. the width of each one-hot row.

    Returns:
        Float ndarray with one row of length ``C`` per label.
    """
    # np.asarray lets callers pass plain lists or tuples; calling .reshape
    # directly on `y` required it to already be an ndarray.
    labels = np.asarray(y).reshape(-1)
    # Row k of the identity matrix is the one-hot vector for class k.
    Y = np.eye(C)[labels]
    return Y


# Features for the LSTM model: padded index matrices. Both splits are padded
# to maxLen, the training length, because that is the model's input width.
# NOTE(review): these assignments overwrite the X_*/Y_* arrays created for
# model2 in the earlier cell.
X_train_indices = sentences_to_indices(train_set, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(np.array(y), C=2)

X_test_indices =  sentences_to_indices(test_set, word_to_index, maxLen)
Y_test_oh = convert_to_one_hot(np.array(y2), C=2)



In [14]:
# Train the LSTM model; the English test split is passed as validation data.
model.fit(X_train_indices, Y_train_oh, epochs = 4, batch_size = 32, shuffle=True, validation_data=(X_test_indices, Y_test_oh))

W0714 01:30:37.102441 10000 deprecation.py:323] From C:\Users\Abhijeet\Miniconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 1997 samples, validate on 1996 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2e905539390>

## Testing Model on German text

In [37]:
# Raw string literal: the Windows path contains backslash sequences (\R, \M, \w)
# that are not valid escapes in a normal string literal. The resulting path is
# character-for-character the same as before.
word_to_index_de, index_to_words_de, word_to_vec_map_de = util.read_muse_vecs(r'D:\Resources\Muse_Embeddings\wiki.multi.de.vec')

In [38]:
# German test split, tokenized with the same options as the English data.
df3 = util.read_review_file("amazon-dataset/german/books/test.json")
test_set_de,y3 = util.tokenize_reviews(df3, keep_text=False, stemming=False, keep_punctuation=True)

# Sanity check: documents and labels must have equal length; show one sample.
print(len(test_set_de), len(y3))
print(test_set_de[1], y3[1])

2000 2000
['e', 'musste', 'ja', 'kommen', '...', '...'] 0


In [51]:
# German features built from the German vector map, plus one-hot labels.
# NOTE(review): despite the `_indices` suffix this holds the output of
# docs_to_vector, not vocabulary indices.
X_test_de_indices =  docs_to_vector(test_set_de, word_to_vec_map_de)
Y_test_de_oh = convert_to_one_hot(np.array(y3), C=2)


[-0.00653661  0.02699564  0.00302272  0.02870599 -0.01389753 -0.00338064
  0.00559786 -0.03346153  0.00070467  0.02043842 -0.03139933  0.00822137
 -0.00539226 -0.04022262  0.0118907  -0.05106944 -0.01170563  0.02477052
  0.01984421  0.02599529  0.01040407  0.00016941 -0.03355211 -0.0111978
 -0.02845445 -0.02703157 -0.00537891 -0.01917961  0.01048362  0.03906411
 -0.03453014  0.04529503 -0.06386306  0.00661982  0.00041111 -0.01415118
  0.00294962 -0.02498741  0.01491031 -0.0217604  -0.00218333  0.000658
 -0.06380795 -0.00809302 -0.01447022  0.00931652  0.02222784  0.01318663
  0.04336886 -0.02933589  0.01648976 -0.03507637 -0.00878142 -0.01914704
 -0.01513701  0.02098027 -0.00102045 -0.02033681  0.0313291   0.05830464
 -0.02235418 -0.02987555  0.05824889 -0.00662939 -0.03876732 -0.02090494
  0.00667941  0.03024812 -0.01016959  0.01416782  0.00364605  0.00987893
  0.02990993 -0.03740573 -0.03310736  0.04370545  0.04237576  0.02729444
  0.01277791 -0.00365579  0.01817824 -0.02823714  0.01

In [52]:
# Evaluate model2 on the German test split. model2 was fit only on the English
# training data in the earlier cell; no German data was used for training.
loss,acc = model2.evaluate(x=X_test_de_indices, y=Y_test_de_oh, batch_size=32, verbose=1)
print(acc)
# Number of entries loaded from the German vector file.
print(len(index_to_words_de))

0.681
199982


## Testing Model on French text

In [13]:
# Raw string literal: the Windows path contains backslash sequences (\R, \M, \w)
# that are not valid escapes in a normal string literal. The resulting path is
# character-for-character the same as before.
word_to_index_fr, index_to_words_fr, word_to_vec_map_fr = util.read_muse_vecs(r'D:\Resources\Muse_Embeddings\wiki.multi.fr.vec')

In [15]:
# French test split, tokenized with the same options as the English data.
df4 = util.read_review_file("amazon-dataset/french/books/test.json")
test_set_fr,y4 = util.tokenize_reviews(df4, keep_text=False, stemming=False, keep_punctuation=True)

# Sanity check: documents and labels must have equal length; show one sample.
print(len(test_set_fr), len(y4))
print(test_set_fr[1], y4[1])

2000 2000
['super', 'recettes', 'faciles', 'à', 'réaliser'] 1


In [16]:
# French features built from the French vector map, plus one-hot labels.
# NOTE(review): the `_de` names from the German section are reused here for the
# French data, so the German arrays are overwritten. The next cell reads these
# names, so any rename must be applied to both cells together.
X_test_de_indices =  docs_to_vector(test_set_fr, word_to_vec_map_fr)
Y_test_de_oh = convert_to_one_hot(np.array(y4), C=2)

In [17]:
# Evaluate model2 on the French test split. The `_de`-named arrays hold the
# French data at this point, assigned in the preceding cell.
loss,acc = model2.evaluate(x=X_test_de_indices, y=Y_test_de_oh, batch_size=32, verbose=1)
print(acc)

0.7245


NameError: name 'index_to_words_de' is not defined