In [1]:
import csv
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM, Dropout,Embedding, Bidirectional
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('arabic'))

2022-07-20 22:53:31.876658: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-20 22:53:31.876694: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aashrafh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
vocab_size = 5000
embedding_dim = 128
max_length = 200
oov_tok = '<OOV>' #  Out of Vocabulary
training_portion = 0.8

In [4]:
articles = []
labels = []

with open("Data/intent.csv", 'r',encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        try:
            labels.append(row[1])   
        except:
            continue
        article = row[0]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)

In [5]:
len(labels),len(articles)

(8289, 8289)

In [6]:
print(labels[0])

weather


In [7]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

In [8]:
print('train_articles' ,len(train_articles))
print('train_labels', len(train_labels))
print('validation_articles', len(validation_articles))
print('validation_labels', len(validation_labels))

train_articles 6631
train_labels 6631
validation_articles 1658
validation_labels 1658


In [9]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [10]:
import pickle
filename = f'../../utils/tokenizer.sav'
pickle.dump(tokenizer, open(filename, 'wb'))

In [11]:
word_index

{'<OOV>': 1,
 'هل': 2,
 'ما': 3,
 'الطقس': 4,
 'الساعة': 5,
 '،': 6,
 'الخاص': 7,
 'سيكون': 8,
 'يوم': 9,
 'يمكنك': 10,
 'التقويم': 11,
 'اليوم': 12,
 'موعد': 13,
 'مساءً': 14,
 'الأسبوع': 15,
 'كيف': 16,
 'التحقق': 17,
 'من': 18,
 'فضلك': 19,
 'وقت': 20,
 'أريد': 21,
 'ستكون': 22,
 'توقعات': 23,
 'الحدث': 24,
 'بي': 25,
 '30': 26,
 'الجو': 27,
 'ماذا': 28,
 'إضافة': 29,
 'حدد': 30,
 'موعدًا': 31,
 'قل': 32,
 'لدي': 33,
 'طبيب': 34,
 'أود': 35,
 'مارس': 36,
 'أعرف': 37,
 'يمكن': 38,
 'أحتاج': 39,
 'الأحداث': 40,
 '4': 41,
 '3': 42,
 'مارس؟': 43,
 'تاريخ': 44,
 'صباحًا': 45,
 'متى': 46,
 'تخبرني': 47,
 'الظهر': 48,
 'يرجى': 49,
 'اليوم؟': 50,
 'الشهر': 51,
 'نعم': 52,
 'القادم': 53,
 'المقبل': 54,
 'درجة': 55,
 '5': 56,
 'شكرا': 57,
 'ليوم': 58,
 'أخبرني': 59,
 '11': 60,
 'الحياة': 61,
 'أكون': 62,
 'في': 63,
 'الثاني': 64,
 'تحقق': 65,
 'كم': 66,
 'عندما': 67,
 '12': 68,
 'الوقت': 69,
 'منبه': 70,
 'أنا': 71,
 'يمكنني': 72,
 'يكون': 73,
 '15': 74,
 'يجب': 75,
 'لديك': 76,
 'شكرًا': 77,

In [12]:
tokenizer.texts_to_sequences(['what will be the weather tommorrow morning ?'])

[[1, 1, 1, 1162, 1, 1, 1]]

In [13]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
len(train_sequences)

6631

In [14]:
train_sequences[0]

[3, 4, 252, 54]

In [15]:
train_padded = pad_sequences(train_sequences, maxlen=max_length,)

In [16]:
len(train_padded),len(train_padded[0])

(6631, 200)

In [17]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length)

In [18]:
set(labels)

{'general', 'greeting', 'recommendation', 'schedule', 'thank', 'weather'}

In [19]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [20]:
validation_label_seq

array([[2],
       [3],
       [1],
       ...,
       [2],
       [2],
       [2]])

In [21]:
# train_labels[744], training_label_seq[744]

In [22]:
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim ))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(7,activation='softmax' ))
model.summary()

2022-07-20 22:53:53.803415: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-20 22:53:53.803443: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-20 22:53:53.803464: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (aashrafh-dell): /proc/driver/nvidia/version does not exist
2022-07-20 22:53:53.803928: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         640000    
                                                                 
 dropout (Dropout)           (None, None, 128)         0         
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense (Dense)               (None, 7)                 1799      
                                                                 
Total params: 904,967
Trainable params: 904,967
Non-trainable params: 0
_________________________________________________________________


In [23]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [24]:
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/10
208/208 - 76s - loss: 0.7412 - accuracy: 0.7258 - val_loss: 0.2910 - val_accuracy: 0.9180 - 76s/epoch - 366ms/step
Epoch 2/10
208/208 - 69s - loss: 0.1808 - accuracy: 0.9504 - val_loss: 0.1886 - val_accuracy: 0.9451 - 69s/epoch - 331ms/step
Epoch 3/10
208/208 - 70s - loss: 0.0878 - accuracy: 0.9747 - val_loss: 0.1732 - val_accuracy: 0.9469 - 70s/epoch - 337ms/step
Epoch 4/10
208/208 - 71s - loss: 0.0532 - accuracy: 0.9824 - val_loss: 0.2031 - val_accuracy: 0.9499 - 71s/epoch - 339ms/step
Epoch 5/10
208/208 - 70s - loss: 0.0378 - accuracy: 0.9858 - val_loss: 0.2201 - val_accuracy: 0.9463 - 70s/epoch - 335ms/step
Epoch 6/10
208/208 - 70s - loss: 0.0260 - accuracy: 0.9922 - val_loss: 0.1867 - val_accuracy: 0.9511 - 70s/epoch - 336ms/step
Epoch 7/10
208/208 - 70s - loss: 0.0246 - accuracy: 0.9917 - val_loss: 0.2007 - val_accuracy: 0.9524 - 70s/epoch - 337ms/step
Epoch 8/10
208/208 - 67s - loss: 0.0176 - accuracy: 0.9949 - val_loss: 0.1996 - val_accuracy: 0.9511 - 67s/epoch - 322

In [25]:
txt_list = ["ما حال الطقس اليوم ؟","احجز لى معاد اليوم الساعة 5","كان بوم جميل جدا"," اقترح فيلم جديد لاشاهده"]
labels = ['schedule','general','weather','recommendation','greeting','thank']
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['weather', 'schedule', 'general', 'recommendation']


In [26]:
txt_list = ["انا حزين جدا","سوف ارسم قطتى حتى انسى","ابقى فكرنى بمعاد الدكتور بكرة","الجو عامل ايه فى المانيا","عايز فيلم اكشن جديد اتفرج عليه"]

answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['general', 'general', 'schedule', 'weather', 'recommendation']


In [27]:
model.save("models")



INFO:tensorflow:Assets written to: models/assets


INFO:tensorflow:Assets written to: models/assets


In [28]:
from tensorflow import keras

m = keras.models.load_model("models")

In [29]:
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = m.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['general', 'general', 'schedule', 'weather', 'recommendation']
