In [18]:
import matplotlib.pyplot as plt
import os
import re
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
import subprocess
import re

In [19]:
print(tf.__version__)

2.13.0


In [20]:
# Set the seed value for experiment reproducibility.
SEED = 42
EPOCHS = 100
BATCH_SIZE = 4
VALIDATION_SPLIT = 0.2
tf.random.set_seed(SEED)
# np.random.seed(SEED)

In [21]:
sentiment_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset/sentiment',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')

print(sentiment_dataset[0].class_names)

Found 6614 files belonging to 4 classes.
Using 5292 files for training.
Using 1322 files for validation.
['negative', 'neutral', 'null', 'positive']


In [22]:
type_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset/type',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')

print(type_dataset[0].class_names)

Found 6614 files belonging to 8 classes.
Using 5292 files for training.
Using 1322 files for validation.
['announcement', 'command', 'greeting', 'null', 'opinion', 'other', 'question', 'statement']


In [23]:
topic_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset/topic',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')

print(topic_dataset[0].class_names)

Found 6614 files belonging to 10 classes.
Using 5292 files for training.
Using 1322 files for validation.
['books', 'comments', 'education', 'game', 'null', 'other', 'person', 'sports', 'the_media', 'toxic']


In [24]:
print(sentiment_dataset[0].class_names)
print(type_dataset[0].class_names)
print(topic_dataset[0].class_names)

['negative', 'neutral', 'null', 'positive']
['announcement', 'command', 'greeting', 'null', 'opinion', 'other', 'question', 'statement']
['books', 'comments', 'education', 'game', 'null', 'other', 'person', 'sports', 'the_media', 'toxic']


In [25]:
def custom_standardization(input_data):
  input_data = tf.strings.regex_replace(tf.strings.regex_replace(tf.strings.regex_replace(input_data, '<br>', ' '), '<br/>', ' '), '<br />', ' ')
  input_data = tf.strings.lower(input_data)

  return input_data

max_features = 20000
sequence_length = 1000

unique_tokens = set()

datasets = [
    sentiment_dataset[0],
    sentiment_dataset[1],
    type_dataset[0],
    type_dataset[1],
    topic_dataset[0],
    topic_dataset[1]
]
# def custom_split(input_data):
#    return input_data.split("\U")

for dataset in datasets:
    vectorize_layer = tf.keras.layers.TextVectorization(
      standardize=custom_standardization,
      max_tokens=max_features,
      output_mode='int',
      output_sequence_length=sequence_length)

    # Use adapt to update the unique_tokens set
    vectorize_layer.adapt(dataset.map(lambda x, y: x))
    print(list(vectorize_layer.get_vocabulary()))
    unique_tokens.update(vectorize_layer.get_vocabulary())

# Create a new TextVectorization layer with the merged vocabulary
merged_vectorize_layer = tf.keras.layers.TextVectorization(
  standardize=custom_standardization,
  max_tokens=max_features,
  output_mode='int',
  output_sequence_length=sequence_length)
# Adapt the new layer with the merged vocabulary
merged_vectorize_layer.adapt(list(unique_tokens))

print(len(merged_vectorize_layer.get_vocabulary()))
print(merged_vectorize_layer.get_vocabulary())

['', '[UNK]', 'و', 'میا', 'که', 'کوروش', 'خیلی', 'ویدیو', 'از', 'رو', 'به', 'تو', 'با', 'من', 'این', 'یه', 'های', 'ایمان', 'بازی', '<a', 'عالی', 'خوب', 'هم', 'بود', 'ها', 'دارم', 'مرسی', 'شما', 'ولی', 'فقط', 'همیشه', '❤', 'میشه', 'ویدئو', 'تا', 'بهترین', 'بسکتبال', 'روز', 'بعد', 'چرا', 'ولاگ', 'واقعا', 'ما', 'کامنت', 'اون', 'هفته', 'مثل', 'دیگه', 'you', 'داره', 'ی', 'وقتی', 'love', 'دوست', 'چقدر', 'همه', 'بیشتر', 'میکنم', 'اول', 'یک', 'برای', 'در', 'هر', 'یا', 'لایک', 'ای', 'خواهر', 'دوستون', 'اگه', 'انرژی', 'کنید', 'کورش', 'قشنگ', 'شروع', '😂', 'الان', 'منم', 'سلام', 'حس', 'وای', 'لطفا', 'باید', 'تابستون', 'توی', '❤❤', 'می', 'بچه', 'باشه', 'حال', 'i', 'شده', 'کن', 'اینکه', 'زندگی', 'چه', 'mia', 'کنم', 'دو', 'اولین', 'هاتون', 'جدید', 'دیدن', '❤❤❤', 'انقدر', 'فوتبال', 'شد', 'تر', 'برین', 'day', 'ک', 'عاشق', 'سال', '،', 'چقد', 'هست', 'دیدم', 'دوباره', 'خوشحال', 'the', 'چی', 'میبینم', 'ترین', 'نیست', 'سه', 'دلم', 'دقیقه', 'بار', 'فکر', 'بزارید', 'افسردگی', 'یوتیوب', 'کل', 'چون', 'تتو', 'یک

In [26]:
for text_batch, label_batch in sentiment_dataset[0]:
    for i in range(text_batch.shape[0]):
        print("Review", text_batch.numpy()[i].decode('utf-8'))
        print("Label", label_batch.numpy()[i])


Review ممه هاتو بخورم میاجونم❤
Label 2
Review قلبای من عشقای من
Label 2
Review ارایشی که میا میکنه: <br>ریمل.زدافتاب.رژ.رژگونه تمامممم😂❤
Label 2
Review My love
Label 2
Review گیس بهت میاد میا زیبا تر شدی
Label 2
Review مرسی میا خیلی خوبی🥲تو بدترین حالت تمام عمرمم ولی واس چندلحظه حالم خوب شد فک کنم منم افسردگی تابستونی گرفته بودم<br>راستی کوروش چ دستخط قشنگی داره
Label 2
Review good
Label 2
Review @Nazanin گوه پیچه😂
Label 2
Review میا و کوروش قشنگم  خیلی دوستون دارم میا و کوروش شما منو خیلی خوشحال میکنید3&gt;&gt;
Label 2
Review گیم چرت بهزی کنید با مهمون❤
Label 2
Review دو عدد با انرژی❤
Label 2
Review کوروش جر زن 🤣😭
Label 3
Review کی دلش برا موریس تنگ شده
Label 2
Review ilysm
Label 2
Review میا زیر چشمات خیلی گود رفته عزیزم ،ب خودت سخت نگیر و استراحت کن ب خودت برس💋💋💋دوست دااااارم❤
Label 2
Review ۱هفته پیش تولدم بود بابت کادو ممنون❤❤❤❤❤❤🧡🧡🧡🧡🧡💛💛💛💛💛💚💚💚💚💙💙💙💙💙💙💜💜💜💙🤎🤎🖤💙💙💙💙💙💚💚💚
Label 2
Review <a href="https://www.youtube.com/watch?v=sZo4TmfXiYY&amp;t=9m33s">9:33</a> ایمان عشقی وافعا بی نظیرید❤

In [27]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return merged_vectorize_layer(text), label

In [28]:
# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(sentiment_dataset[0]))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", sentiment_dataset[0].class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'\xd8\xa7\xd8\xb1\xd8\xa7\xdb\x8c\xd8\xb4\xdb\x8c \xda\xa9\xd9\x87 \xd9\x85\xdb\x8c\xd8\xa7 \xd9\x85\xdb\x8c\xda\xa9\xd9\x86\xd9\x87: <br>\xd8\xb1\xdb\x8c\xd9\x85\xd9\x84.\xd8\xb2\xd8\xaf\xd8\xa7\xd9\x81\xd8\xaa\xd8\xa7\xd8\xa8.\xd8\xb1\xda\x98.\xd8\xb1\xda\x98\xda\xaf\xd9\x88\xd9\x86\xd9\x87 \xd8\xaa\xd9\x85\xd8\xa7\xd9\x85\xd9\x85\xd9\x85\xd9\x85\xf0\x9f\x98\x82\xe2\x9d\xa4', shape=(), dtype=string)
Label null
Vectorized review (<tf.Tensor: shape=(1, 1000), dtype=int64, numpy=
array([[10400,  1528,  4336,  3756,  6485,  8289,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
 

In [29]:
sentiment_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset_non_null/sentiment',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')
type_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset_non_null/type',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')
topic_dataset = tf.keras.utils.text_dataset_from_directory(
    'file:///home/alireza/Documents/Projects/context_classifier/ai/dataset_non_null/topic',
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    seed=SEED,
    subset='both')

Found 280 files belonging to 3 classes.
Using 224 files for training.
Using 56 files for validation.
Found 218 files belonging to 7 classes.
Using 175 files for training.
Using 43 files for validation.
Found 214 files belonging to 9 classes.
Using 172 files for training.
Using 42 files for validation.


In [30]:
print(sentiment_dataset[0].class_names)
print(type_dataset[0].class_names)
print(topic_dataset[0].class_names)

['negative', 'neutral', 'positive']
['announcement', 'command', 'greeting', 'opinion', 'other', 'question', 'statement']
['books', 'comments', 'education', 'game', 'other', 'person', 'sports', 'the_media', 'toxic']


In [31]:
sentiment_train_ds = sentiment_dataset[0].map(vectorize_text)
sentiment_val_ds = sentiment_dataset[1].map(vectorize_text)

type_train_ds = type_dataset[0].map(vectorize_text)
type_val_ds = type_dataset[1].map(vectorize_text)

topic_train_ds = topic_dataset[0].map(vectorize_text)
topic_val_ds = topic_dataset[1].map(vectorize_text)

In [32]:
AUTOTUNE = tf.data.AUTOTUNE

sentiment_train_ds = sentiment_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
sentiment_val_ds = sentiment_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

type_train_ds = type_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
type_val_ds = type_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

topic_train_ds = topic_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
topic_val_ds = topic_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [33]:
embedding_dim = 16

In [34]:
sentiment_model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(len(sentiment_dataset[0].class_names)),
  layers.Activation('sigmoid')])

sentiment_model.summary()
sentiment_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])

sentiment_history = sentiment_model.fit(sentiment_train_ds, validation_data=sentiment_val_ds, epochs=EPOCHS)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 16)          320016    
                                                                 
 dropout_2 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 51        
                                                                 
 activation_1 (Activation)   (None, 3)                 0         
                                                      

In [35]:
type_model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(len(type_dataset[0].class_names)),
  layers.Activation('sigmoid')])

type_model.summary()
type_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])

type_history = type_model.fit(sentiment_train_ds, validation_data=sentiment_val_ds, epochs=EPOCHS)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 16)          320016    
                                                                 
 dropout_4 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_2  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_5 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 7)                 119       
                                                                 
 activation_2 (Activation)   (None, 7)                 0         
                                                      

In [36]:
topic_model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(len(topic_dataset[0].class_names)),
  layers.Activation('sigmoid')])

topic_model.summary()
topic_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])

topic_history = topic_model.fit(sentiment_train_ds, validation_data=sentiment_val_ds, epochs=EPOCHS)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 16)          320016    
                                                                 
 dropout_6 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_3  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_7 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 9)                 153       
                                                                 
 activation_3 (Activation)   (None, 9)                 0         
                                                      

In [37]:
sentiment_history_dict = sentiment_history.history
sentiment_history_dict.keys()

type_history_dict = type_history.history
type_history_dict.keys()

topic_history_dict = topic_history.history
topic_history_dict.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [38]:
# acc = history_dict['accuracy']
# val_acc = history_dict['accuracy']
# loss = history_dict['loss']
# val_loss = history_dict['val_loss']

# epochs = range(1, len(acc) + 1)

# # "bo" is for "blue dot"
# plt.plot(epochs, loss, 'bo', label='Training loss')
# # b is for "solid blue line"
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
# plt.title('Training and validation loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()

# plt.show()

In [39]:
# plt.plot(epochs, acc, 'bo', label='Training acc')
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='lower right')

# plt.show()

In [40]:
export_model = tf.keras.Sequential([
  merged_vectorize_layer,
  sentiment_model,
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

examples = [
  "هیکل میا خیلی قشنگه", #"positive, null, null"
  "Love you guys❤️❤️", #"positive, null, null"
  "کروش کلی خطا کردی😂 رانینگ کردی توپ رو هی گرفتی دوباره دیریب میزدی😂👌" #"positive, opinion, the_media",
]

export_model.predict(examples)




array([[0.20097122, 0.4582185 , 0.7117907 ],
       [0.20119534, 0.45758572, 0.71216637],
       [0.19766891, 0.45453098, 0.7171817 ]], dtype=float32)

In [41]:
sentiment_model.save('./models/model5/sentiment/model.h5')
sentiment_model.save('./models/model5/sentiment/model.keras')
# tensorflowjs_converter --input_format=keras ./ai/models/model5/sentiment/model.h5 ./ai/models/model5/sentiment/

# Define the command as a list of arguments
command = [
    'tensorflowjs_converter',
    '--input_format=keras',
    './models/model5/sentiment/model.h5',
    './models/model5/sentiment/',
]

# Run the command
try:
    subprocess.run(command, check=True, shell=False)
    print("Model conversion completed successfully.")
except subprocess.CalledProcessError as e:
    print("Model conversion failed. Error:", e)

  saving_api.save_model(


Model conversion completed successfully.


In [42]:
type_model.save('./models/model5/type/model.h5')
type_model.save('./models/model5/type/model.keras')
# tensorflowjs_converter --input_format=keras ./ai/models/model5/sentiment/model.h5 ./ai/models/model5/sentiment/

# Define the command as a list of arguments
command = [
    'tensorflowjs_converter',
    '--input_format=keras',
    './models/model5/type/model.h5',
    './models/model5/type/',
]

# Run the command
try:
    subprocess.run(command, check=True, shell=False)
    print("Model conversion completed successfully.")
except subprocess.CalledProcessError as e:
    print("Model conversion failed. Error:", e)

  saving_api.save_model(


Model conversion completed successfully.


In [43]:
topic_model.save('./models/model5/topic/model.h5')
topic_model.save('./models/model5/topic/model.keras')
# tensorflowjs_converter --input_format=keras ./ai/models/model5/sentiment/model.h5 ./ai/models/model5/sentiment/

# Define the command as a list of arguments
command = [
    'tensorflowjs_converter',
    '--input_format=keras',
    './models/model5/topic/model.h5',
    './models/model5/topic/',
]

# Run the command
try:
    subprocess.run(command, check=True, shell=False)
    print("Model conversion completed successfully.")
except subprocess.CalledProcessError as e:
    print("Model conversion failed. Error:", e)

  saving_api.save_model(


Model conversion completed successfully.
