In [1]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM, Dropout,Embedding, Bidirectional

In [2]:
file = open("../../../Data/stopwords.txt", "r", encoding="utf-8")
STOPWORDS = file.read().split()
file.close()

In [3]:
vocab_size = 50000
embedding_dim = 128
max_length = 200
oov_tok = '<OOV>' #  Out of Vocabulary
training_portion = 0.8

In [4]:
articles = []
labels = []

with open("../data_preprocessing/preprocessed_data/data_main.csv", 'r',encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        try:
            labels.append(row[3])   
        except:
            continue
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)

In [5]:
labels[:2]

['0', '0']

In [6]:
articles[:2]

['كون خير',
 'صبح وانا جوايا ميه حسره ووجع قلب محسيتهمش سنتين nمنك لله يا لحه n تيران وصنافير مصريه']

In [7]:
len(labels),len(articles)

(58198, 58198)

In [8]:
print(labels[0])

0


In [9]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

In [10]:
print('train_articles' ,len(train_articles))
print('train_labels', len(train_labels))
print('validation_articles', len(validation_articles))
print('validation_labels', len(validation_labels))

train_articles 46558
train_labels 46558
validation_articles 11640
validation_labels 11640


In [11]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [12]:
# import pickle
# filename = f'../../utils/tokenizer.sav'
# pickle.dump(tokenizer, open(filename, 'wb'))

In [13]:
word_index

{'<OOV>': 1,
 'ان': 2,
 'كنت': 3,
 'كون': 4,
 'انه': 5,
 'يوم': 6,
 'يا': 7,
 'الله': 8,
 'لان': 9,
 'شيء': 10,
 'اللي': 11,
 'حقا': 12,
 'عمل': 13,
 'شعر': 14,
 'اي': 15,
 'جيد': 16,
 'او': 17,
 'اوه': 18,
 'شخص': 19,
 'ناس': 20,
 'امر': 21,
 'حتي': 22,
 'مش': 23,
 'ايضا': 24,
 'n': 25,
 'انت': 26,
 'راءع': 27,
 'عيد': 28,
 'امل': 29,
 'اعتقد': 30,
 'انك': 31,
 'انها': 32,
 'انن': 33,
 'تمني': 34,
 'مره': 35,
 'يجب': 36,
 'خير': 37,
 'اذا': 38,
 'حصل': 39,
 'غايه': 40,
 'بدو': 41,
 'اسف': 42,
 'ذهب': 43,
 'اوليمبياد': 44,
 'حدث': 45,
 'سعيد': 46,
 'حسن': 47,
 'اسبوع': 48,
 'قلب': 49,
 'تكون': 50,
 'بدا': 51,
 'كثير': 52,
 'خوف': 53,
 'عام': 54,
 'الا': 55,
 'منزل': 56,
 'والله': 57,
 'داءما': 58,
 'افضل': 59,
 'حزن': 60,
 'مكن': 61,
 'حاجه': 62,
 'اكثر': 63,
 'احد': 64,
 'سبب': 65,
 'اعرف': 66,
 'احب': 67,
 'سعاده': 68,
 'اول': 69,
 'اخر': 70,
 'فعل': 71,
 'وانا': 72,
 'اني': 73,
 'واحد': 74,
 'حاول': 75,
 'حياه': 76,
 'ده': 77,
 'ربما': 78,
 'قول': 79,
 'شعور': 80,
 'خايفه': 81,
 'شك

In [14]:
tokenizer.texts_to_sequences(['what will be the weather tommorrow morning ?'])

[[24420, 1, 1, 2487, 1, 1, 1]]

In [15]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
len(train_sequences)

46558

In [16]:
train_sequences[0]

[4, 37]

In [17]:
train_padded = pad_sequences(train_sequences, maxlen=max_length,)

In [18]:
len(train_padded),len(train_padded[0])

(46558, 200)

In [19]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length)

In [20]:
set(labels)

{'0', '1'}

In [21]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [22]:
validation_label_seq

array([[1],
       [2],
       [1],
       ...,
       [2],
       [1],
       [1]])

In [23]:
# train_labels[744], training_label_seq[744]

In [24]:
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim ))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(3,activation='softmax' ))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         6400000   
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 771       
                                                                 
Total params: 6,663,939
Trainable params: 6,663,939
Non-trainable params: 0
_________________________________________________________________


In [25]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [26]:
num_epochs = 1
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

1455/1455 - 805s - loss: 0.4701 - accuracy: 0.7680 - val_loss: 0.3954 - val_accuracy: 0.8155 - 805s/epoch - 553ms/step


In [27]:
txt_list = ["ما حال الطقس اليوم ؟","احجز لى معاد اليوم الساعة 5","كان بوم جميل جدا"," اقترح فيلم جديد لاشاهده"]
labels = ['schedule','general','weather','recommendation','greeting','thank']
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['general', 'general', 'general', 'general']


In [28]:
txt_list = ["انا حزين جدا","سوف ارسم قطتى حتى انسى","ابقى فكرنى بمعاد الدكتور بكرة","الجو عامل ايه فى المانيا","عايز فيلم اكشن جديد اتفرج عليه"]

answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['schedule', 'general', 'general', 'general', 'schedule']


In [29]:
# model.save("models")

In [30]:
# from tensorflow import keras

# m = keras.models.load_model("models")

In [31]:
# answer = list()
# for txt in txt_list:
#     seq = tokenizer.texts_to_sequences([txt])
#     padded = pad_sequences(seq, maxlen=max_length)
#     pred = m.predict(padded)
#     try:
#         label = labels[np.argmax(pred)-1] 
#         answer.append(label)
#     except:
#         answer.append('general')
# print(answer)