## Arabic Part of Speech Tagger - Training Notebook 

### Introduction

In this notebook we continue our work towards building arabic part-of-speech tagger

This notebook is intended to train an *_Long Short Term Memory Network_* for _Sequence Modeling of the language_ 

In [1]:
import pandas as pd

In [2]:
import keras

Using TensorFlow backend.


In [3]:
from itertools import chain

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from keras.preprocessing.sequence import pad_sequences

In [6]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [7]:
dataset = pd.read_pickle('data/tagset.pickle')

In [8]:
dataset = dataset.drop(['c', 'v'], axis=1)

In [7]:
dataset.head()

Unnamed: 0,t,tags
0,"[بسم, ٱلله, ٱلرحمن, ٱلرحيم]","[N, PN, ADJ, ADJ]"
1,"[ٱلحمد, لله, رب, ٱلعلمين]","[N, PN, N, N]"
2,"[ٱلرحمن, ٱلرحيم]","[ADJ, ADJ]"
3,"[ملك, يوم, ٱلدين]","[N, N, N]"
4,"[إياك, نعبد, وإياك, نستعين]","[PRON, V, PRON, V]"


In [9]:
sentences, sentence_tags = dataset.t.values.tolist(), dataset.tags.values.tolist()

In [10]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(sentences, sentence_tags, test_size=0.2)

In [11]:
words = list(chain.from_iterable(sentences))

In [12]:
tags = list(chain.from_iterable(sentence_tags))

In [13]:
tags = set(tags)

In [14]:
words = set(words)

In [15]:
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [16]:
word2index

{'وملءكته': 2,
 'وأعانه': 3,
 'ويولج': 4,
 'عني': 5,
 'لخبير': 6,
 'وٱلركب': 7,
 'نيلا': 8,
 'ءاخرين': 9,
 'مدكر': 10,
 'شيخ': 11,
 'صدقوا': 12,
 'ملق': 13,
 'فكف': 14,
 'ٱلتوب': 15,
 'ٱلخطاب': 16,
 'ربطنا': 17,
 'متراكبا': 18,
 'وأنه': 19,
 'لكذبون': 20,
 'تمدن': 21,
 'ويعذب': 22,
 'يغير': 23,
 'ٱلحسره': 24,
 'وٱلفجر': 25,
 'ثمودا': 26,
 'قاعا': 27,
 'أعينهم': 28,
 'بريء': 29,
 'كبره': 30,
 'ٱلٱسم': 31,
 'شهيدا': 32,
 'ويفسدون': 33,
 'يخروا': 34,
 'غفر': 35,
 'ٱلتابوت': 36,
 'مكروها': 37,
 'فأتي': 38,
 'لمثوبه': 39,
 'وأنذر': 40,
 'ذلكم': 41,
 'ورتل': 42,
 'تضحكون': 43,
 'رضيتم': 44,
 'بٱلقلم': 45,
 'سمعتموه': 46,
 'يمارون': 47,
 'سأصليه': 48,
 'وٱلمترديه': 49,
 'تزغ': 50,
 'يراءون': 51,
 'عيسي': 52,
 'ملأ': 53,
 'ٱلملءا': 54,
 'أرني': 55,
 'تستفتيان': 56,
 'تجره': 57,
 'ألفت': 58,
 'معاد': 59,
 'نبين': 60,
 'لفسدتا': 61,
 'ذراعا': 62,
 'ماذا': 63,
 'تضرونه': 64,
 'ٱطمس': 65,
 'بفتنين': 66,
 'رفعها': 67,
 'أنعمنا': 68,
 'أعجبتكم': 69,
 'مبارك': 70,
 'فليتوكل': 71,
 'ٱلذنوب': 72,
 'ٱلس

In [17]:
tag2index

{'CERT': 1,
 'RET': 2,
 'IMPN': 3,
 'COND': 4,
 'N': 5,
 'REL': 6,
 'ANS': 7,
 'EXP': 8,
 'SUR': 9,
 'AVR': 10,
 'CONJ': 11,
 'T': 12,
 'DEM': 13,
 'SUB': 14,
 'AMD': 15,
 'PREV': 16,
 'FUT': 17,
 'V': 18,
 'ACC': 19,
 'EXH': 20,
 'SUP': 21,
 'P': 22,
 'PN': 23,
 'NEG': 24,
 'RES': 25,
 'INT': 26,
 'PRON': 27,
 'INC': 28,
 'PRO': 29,
 'EXL': 30,
 'LOC': 31,
 'ADJ': 32,
 'INTG': 33,
 '-PAD-': 0}

In [18]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

In [19]:
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    train_sentences_X.append(s_int)

In [26]:
train_sentences_X

[[554, 14687, 8785, 6852, 12876, 5812, 7907, 9260, 1229, 8450, 4473],
 [14681,
  7823,
  7243,
  9718,
  7907,
  5468,
  9812,
  1805,
  3066,
  12495,
  1589,
  11570,
  3148,
  11546,
  4960,
  5161,
  11863,
  4896,
  11521,
  9889,
  12894,
  2266],
 [11640, 2956, 11249, 2104, 3567, 14704, 5253, 2923],
 [5253, 8583],
 [2771, 13753, 5227, 4960, 9170, 3498, 11980],
 [434,
  6344,
  8380,
  10486,
  7577,
  8871,
  6852,
  5929,
  8465,
  6103,
  40,
  11699,
  12132,
  1929],
 [13768, 8034, 9170, 11781, 593, 1843],
 [14181, 14101, 12810, 3587, 13747],
 [5674, 1099, 5289],
 [12065, 13198, 304, 10253, 2508, 9567, 876, 3319, 13658, 12718, 7985],
 [14181, 12810, 2992, 13245, 6852, 9073, 6852, 1142, 80, 7567, 13542, 13245],
 [1252, 2992, 8052, 9219, 9889, 6090, 13508, 11372, 1079, 2703, 4758, 14624],
 [1252,
  2992,
  8052,
  845,
  9669,
  6852,
  8054,
  8465,
  1163,
  8054,
  10884,
  11372,
  9145,
  11372,
  2096,
  7579,
  4191,
  9086,
  12242,
  8565,
  8104,
  10001,
  12922,
  

In [20]:
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)

In [21]:
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

In [22]:
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

In [23]:
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[1634, 13059, 6344, 8121, 5483, 3903, 9215, 14114, 606, 6833, 1982, 336]
[2299, 7418, 14578, 1204, 5555, 10716, 7866, 7987, 12012, 244, 5322, 6812, 9146, 2348, 13809, 5930, 10905]
[1, 18, 27, 5, 18, 23, 22, 5, 5, 18, 31, 5]
[18, 18, 22, 5, 22, 5, 6, 18, 18, 6, 18, 22, 5, 5, 6, 18, 27]


In [24]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)

128


In [25]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[ 1634 13059  6344  8121  5483  3903  9215 14114   606  6833  1982   336
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[ 2299  7418 14578  1204  5555 10716  7866  7987 12012   244  5322  6812
  9146  2348 13809  5930 10905     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 

In [28]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 128, 128)          1883776   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128, 512)          788480    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 128, 34)           17442     
_________________________________________________________________
activation_2 (Activation)    (None, 128, 34)           0         
Total params: 2,689,698
Trainable params: 2,689,698
Non-trainable params: 0
_________________________________________________________________


In [29]:
import numpy as np

In [30]:
## One-Hot Encoded tags
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [41]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [49]:
cat_train_tags_y[0][1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

### Train ! Boom

In [32]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=50, validation_split=0.2)

Train on 3977 samples, validate on 995 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a2ab07cf8>

In [27]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [31]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [1]:
predictions = model.predict(test_sentences_X)

NameError: name 'model' is not defined

In [46]:
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['V', 'ACC', 'CERT', 'V', 'P', 'P', 'N', 'ACC', 'ACC', 'CERT', 'V', 'ACC', 'ACC', 'PRON', 'N', 'ADJ', 'V', 'P', 'N', 'REL', 'REL', 'V', 'REL', 'PRON', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-'], ['N', 'V', 'PRON', 'N', 'PRON', 'PRON', 'N', 'N', 'N', 'NEG', 'V', 'P', 'N', 'PN', 'N', 'N', 'N', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-

In [48]:
def predict(sentence):
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    preds = model.predict([s_int])
    return logits_to_tokens(preds, {i: t for t, i in tag2index.items()})

In [49]:
predict(test_sentences_X[0])

ValueError: Error when checking input: expected input_2 to have shape (88,) but got array with shape (1,)

In [33]:
model_json = model.to_json()

In [34]:
model_json

'{"class_name": "Sequential", "config": {"name": "sequential_2", "layers": [{"class_name": "Embedding", "config": {"name": "embedding_2", "trainable": true, "batch_input_shape": [null, null], "dtype": "float32", "input_dim": 14717, "output_dim": 128, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": null}}, {"class_name": "Bidirectional", "config": {"name": "bidirectional_2", "trainable": true, "layer": {"class_name": "LSTM", "config": {"name": "lstm_2", "trainable": true, "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "unifor

In [35]:
with open("model_num.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("model_num.h5")