In [1]:
import csv
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM, Dropout,Embedding, Bidirectional
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('arabic'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
oov_tok = '<OOV>' #  Out of Vocabulary
training_portion = 0.8

In [3]:
articles = []
labels = []

with open("Data/arabic_main.csv", 'r',encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        labels.append(row[1])
        article = row[0]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append(article)

In [4]:
len(labels),len(articles)

(3376, 3376)

In [5]:
print(labels[0])

movies


In [6]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_labels = labels[train_size:]

In [7]:
print('train_articles' ,len(train_articles))
print('train_labels', len(train_labels))
print('validation_articles', len(validation_articles))
print('validation_labels', len(validation_labels))

train_articles 2700
train_labels 2700
validation_articles 676
validation_labels 676


In [8]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index

In [9]:
import pickle
filename = f'../../../utils/recomm_tokenizer.sav'
pickle.dump(tokenizer, open(filename, 'wb'))

In [10]:
word_index

{'<OOV>': 1,
 'فيلم': 2,
 '،': 3,
 'مشاهدة': 4,
 'أريد': 5,
 'يمكنك': 6,
 'هل': 7,
 'عرض': 8,
 'أود': 9,
 'ما': 10,
 'يمكنني': 11,
 'أقرب': 12,
 'وقت': 13,
 'الأفلام': 14,
 'الفيلم': 15,
 'الأبعاد': 16,
 'نوع': 17,
 'البحث': 18,
 'أفلام': 19,
 'الفيلم؟': 20,
 'أين': 21,
 'ثلاثي': 22,
 'فضلك': 23,
 'ابحث': 24,
 'تجد': 25,
 'أنا': 26,
 'مسرح': 27,
 'أرغب': 28,
 'لمشاهدته': 29,
 'سان': 30,
 'العثور': 31,
 'يتم': 32,
 'مشاهدته': 33,
 'imax': 34,
 'شيء': 35,
 'سينما': 36,
 'فيلمًا': 37,
 'يوم': 38,
 'في': 39,
 'أعطني': 40,
 'من': 41,
 'مارس': 42,
 'أحتاج': 43,
 'century': 44,
 'amc': 45,
 'منتظم': 46,
 'العرض': 47,
 'الثاني': 48,
 'أشاهد': 49,
 'مكان': 50,
 'مطعم': 51,
 'أبحث': 52,
 'الشهر': 53,
 'يبدو': 54,
 'طريق': 55,
 'نعم': 56,
 'اليوم': 57,
 'الاتجاهات': 58,
 'أجد': 59,
 'مركز': 60,
 'أفضل': 61,
 'سيكون': 62,
 'الأسبوع': 63,
 'يمكن': 64,
 'الحصول': 65,
 'الأول': 66,
 'لمشاهدة': 67,
 'عنوان': 68,
 'عرضها': 69,
 'يوجد': 70,
 'اريد': 71,
 'أحب': 72,
 'يكون': 73,
 'مواعيد': 74,
 'محطة': 7

In [11]:
tokenizer.texts_to_sequences(['what will be the weather tommorrow morning ?'])

[[1, 1, 1, 105, 1, 1, 1]]

In [12]:
train_sequences = tokenizer.texts_to_sequences(train_articles)
len(train_sequences)

2700

In [13]:
train_sequences[0]

[187, 776, 539, 70, 481]

In [14]:
train_padded = pad_sequences(train_sequences, maxlen=max_length,)

In [15]:
len(train_padded),len(train_padded[0])

(2700, 200)

In [16]:
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length)

In [17]:
set(labels)

{'locations', 'movies'}

In [18]:
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(train_labels))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(validation_labels))

In [19]:
validation_label_seq

array([[1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [1],
       [2],
       [1],
       [2],
       [1],
       [2],
       [2],
       [1],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [2],
    

In [20]:
# train_labels[744], training_label_seq[744]

In [21]:
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim ))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(8,activation='softmax' ))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          320000    
                                                                 
 dropout (Dropout)           (None, None, 64)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 8)                 1032      
                                                                 
Total params: 387,080
Trainable params: 387,080
Non-trainable params: 0
_________________________________________________________________


In [22]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',optimizer=opt,metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [23]:
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

Epoch 1/10
85/85 - 20s - loss: 0.7669 - accuracy: 0.7078 - val_loss: 0.4376 - val_accuracy: 0.9334 - 20s/epoch - 234ms/step
Epoch 2/10
85/85 - 12s - loss: 0.1446 - accuracy: 0.9578 - val_loss: 0.0518 - val_accuracy: 0.9837 - 12s/epoch - 138ms/step
Epoch 3/10
85/85 - 12s - loss: 0.0341 - accuracy: 0.9889 - val_loss: 0.0310 - val_accuracy: 0.9867 - 12s/epoch - 140ms/step
Epoch 4/10
85/85 - 14s - loss: 0.0174 - accuracy: 0.9944 - val_loss: 0.0269 - val_accuracy: 0.9911 - 14s/epoch - 164ms/step
Epoch 5/10
85/85 - 15s - loss: 0.0097 - accuracy: 0.9974 - val_loss: 0.0247 - val_accuracy: 0.9911 - 15s/epoch - 178ms/step
Epoch 6/10
85/85 - 15s - loss: 0.0069 - accuracy: 0.9981 - val_loss: 0.0237 - val_accuracy: 0.9911 - 15s/epoch - 173ms/step
Epoch 7/10
85/85 - 14s - loss: 0.0052 - accuracy: 0.9981 - val_loss: 0.0258 - val_accuracy: 0.9911 - 14s/epoch - 169ms/step
Epoch 8/10
85/85 - 14s - loss: 0.0030 - accuracy: 0.9993 - val_loss: 0.0217 - val_accuracy: 0.9941 - 14s/epoch - 168ms/step
Epoch 9/

In [24]:
txt_list = ["عايز اشترى هدوم من مكان قريب","جبلى فيلم اشوفه مع العيلة","اية اقرب مستشفى انا تعبان","اقترح فيلم كوميدى جديد لاشاهده"]
labels = ['movies','locations']
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = model.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['locations', 'movies', 'locations', 'movies']


In [25]:
model.save("movie_location_model")



INFO:tensorflow:Assets written to: movie_location_model\assets


INFO:tensorflow:Assets written to: movie_location_model\assets


In [26]:
from tensorflow import keras

m = keras.models.load_model("movie_location_model")

In [27]:
answer = list()
for txt in txt_list:
    seq = tokenizer.texts_to_sequences([txt])
    padded = pad_sequences(seq, maxlen=max_length)
    pred = m.predict(padded)
    try:
        label = labels[np.argmax(pred)-1] 
        answer.append(label)
    except:
        answer.append('general')
print(answer)

['locations', 'movies', 'locations', 'movies']
