In [None]:
# Import the modules needed


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 


import tensorflow as tf

# Report the TensorFlow runtime: library version, eager-execution flag and visible GPUs.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())

# Query the list of physical GPUs once and reuse it for both messages.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)
print("Num GPUs Available: ", len(gpu_devices))

# Load data

In [None]:
# Location of the tweets dataset inside the Kaggle input directory.
file = "/kaggle/input/toxic-arabic-tweets-classification/toxic arabic tweets classification.txt"
# Read the file as tab-separated values into a DataFrame.
tweets = pd.read_csv(file,sep="\t")

# EDA

## Show the head of the data

In [None]:
# Preview the first rows of the loaded tweets.
tweets.head()

## Show the number of null values per column

In [None]:
# Number of null entries in each column.
tweets.isnull().sum()

## describe the data

In [None]:
# Summary statistics of the dataset's columns.
tweets.describe()

## Get the distinct values of classes of tweets

In [None]:
# Distinct labels that occur in the 'Class' column.
tweets['Class'].unique()

## Get the shape of the data (rows, columns)

In [None]:
# (number of rows, number of columns) of the dataset.
tweets.shape

## Visualise each class

In [None]:
import matplotlib.pyplot as plt

# Bar chart with the number of tweets carrying each label.
# The comparison uses the raw string labels, so this cell has to run before the
# 'Class' column is mapped to integers.
class_labels = ['abusive', 'normal', 'hate']
classes = ['Abusive tweets', 'Normal tweets', 'Hate tweets']
# One count per label, in the same order as `classes`; the third bar counts the
# 'hate' tweets (not 'normal' a second time).
values = [int((tweets['Class'] == label).sum()) for label in class_labels]
plt.title('Occurrences of type of tweets')
plt.ylabel('# of Occurrences', fontsize=12)
plt.bar(classes, values)


# Data preparation

We are interested in preparing the data by following these steps:
* Encode the values of the class column
* Clean the data of the tweets column
* Encode the tweets column

Encode the values of the class column as integers so that they can be processed by deep-learning algorithms

In [None]:
def decodeValues(value):
    """Translate a tweet class label into its integer code.

    'abusive' -> 1, 'normal' -> 2, 'hate' -> 3. Any other value yields None.
    """
    label_codes = (('abusive', 1), ('normal', 2), ('hate', 3))
    for label, code in label_codes:
        if value == label:
            return code
    return None

# Map each class label to its numerical value.
# decodeValues returns None for anything that is not one of the three label strings,
# so applying it to a column that has already been encoded would wipe every label.
# The mapping is therefore skipped when the column is already numeric, which makes
# this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(tweets['Class']):
    tweets['Class'] = tweets['Class'].apply(decodeValues)

tweets.head()

## Clean the data of the tweets column
In this section we are interested in data cleaning.
First we need to delete all the non-alphabetic values:
*  Deduplicating
*  Removing punctuation
*  Removing URL data
*  Removing emojis


## Removing emojis

In [None]:
import re
# Characters treated as emoji. Compiled once instead of on every call.
# Each range is restricted to symbol / pictograph blocks, so letters (including
# CJK ideographs and the Arabic presentation forms U+FB50-U+FDFF / U+FE70-U+FEFF)
# are never matched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U000024C2-\U000027BF"  # enclosed letters, shapes, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # misc symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors attached to emoji
    "\U0001F000-\U0001F251"  # game tiles/cards and enclosed supplements
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with every run of emoji characters deleted.

    Args:
        text: the string to clean.

    Returns:
        A new string with all characters matched by `_EMOJI_PATTERN` removed;
        all other characters, letters of any script included, are kept.
    """
    return _EMOJI_PATTERN.sub('', text)



## split text by space

In [None]:
def split_white_space(text):
    """Lower-case `text` and break it into a list of tokens on runs of whitespace."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens


## Remove punctuation

In [None]:
import string

def remove_punctuation(text):
    """Strip ASCII punctuation from every token in `text`.

    Args:
        text: iterable of string tokens (for example the output of
            split_white_space).

    Returns:
        A list in which the characters of `string.punctuation` have been deleted
        from each token; tokens that end up empty are dropped.
    """
    # Deletion table: every character of string.punctuation maps to nothing.
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned_tokens = []
    for token in text:
        # Delete punctuation wherever it occurs rather than cutting the token at the
        # first punctuation mark, so "don't" becomes "dont" and not "don".
        stripped = token.translate(strip_table)
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens

In [None]:
def clean_data(text):
    """Clean one raw tweet and return it as a string.

    Only emoji removal (remove_emoji) is applied; the whitespace-splitting and
    punctuation-removal steps are currently disabled.
    """
    cleaned = remove_emoji(text)
    return cleaned

# Run clean_data on every entry of the 'Tweet' column and store the result back in that column.
tweets['Tweet'] = tweets['Tweet'].apply(clean_data)

# Inspect the first rows after cleaning.
tweets.head()

# Vectorizing the tweets as padded integer sequences

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the tweets for evaluation. The split is seeded so that the same
# tweets land in the training and test sets on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['Tweet'], tweets['Class'], test_size=0.2, random_state=42
)



In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the training tweets only; num_words=5000 limits the
# ids used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Replace each tweet by its list of integer word ids.
# NOTE(review): X_train / X_test are rebound here from text to sequences, so this
# cell presumably cannot be re-run without re-running the split cell first.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print('Number of Unique Tokens',len(tokenizer.word_index))

# Size of the full word index plus one.
# NOTE(review): the model cells below assign their own vocab_size, so this value
# is overwritten before any model uses it.
vocab_size = len(tokenizer.word_index) + 1

# Length every sequence is padded (or cut) to.
maxlen = 200

# padding='post' appends the padding after the tokens of each tweet.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
print( X_train.shape )
print(X_train,y_train)

# Apply LSTM architecture

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
# Size of the embedding table and width of each embedding vector.
vocab_size, embedding_dim = 10000, 1000

# Resolve the TPU cluster, connect to it and initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Distribution strategy built on the resolved TPU.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Build the network inside the strategy scope so that it is created on the TPU.
with tpu_strategy.scope():
    model = tf.keras.Sequential()
    # Word embedding layer (input layer)
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(tf.keras.layers.LSTM(50))
    model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
    model.add(tf.keras.layers.Dense(140, activation='relu'))
    model.add(tf.keras.layers.Dense(150, activation='relu'))
    # Output layer (softmax activation for multi-class classification)
    model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Train the model: 50 epochs of 50 steps each.
model.fit(X_train, y_train, epochs=50, steps_per_epoch=50)

# Calculate the accuracy

In [None]:

# Evaluate the current model (the LSTM when the notebook is run top to bottom) on
# the test set; the result is unpacked into the loss and the accuracy metric.
test_lost , test_acc = model.evaluate(X_test, y_test)


print("The accuracy of the model is:",(test_acc*100))


In [None]:
# Save the current model to LSTM.h5 in the working directory.
model.save(r'./LSTM.h5')

# Apply CNN architecture

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
# Embedding hyper-parameters.
# The embedding table needs a row for every token id the tokenizer can emit.
# tokenizer.num_words is that upper bound, so vocab_size is derived from it instead
# of being hard-coded to a smaller value that would leave token ids out of range.
vocab_size = tokenizer.num_words
embedding_dim = 100

# Instantiating the model in the strategy scope creates the model on the TPU.
# Every layer is taken from tf.keras so the stack does not mix the standalone
# keras package with tf.keras.
with tpu_strategy.scope():
    model = tf.keras.Sequential([
        # Word embedding layer (input layer); input_length matches the padded sequence length.
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', padding="valid"),
        tf.keras.layers.MaxPooling1D(),
        tf.keras.layers.Flatten(),
        # Output layer
        tf.keras.layers.Dense(4, activation="softmax")
    ])
model.summary()


In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss and
# accuracy as the reported metric.
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy" , metrics=["accuracy"])
# Train the model: 60 epochs of 50 steps each.
model.fit(X_train, y_train, epochs=60, steps_per_epoch=50)

In [None]:

# Evaluate the current model (the CNN when the notebook is run top to bottom) on
# the test set; the result is unpacked into the loss and the accuracy metric.
test_lost , test_acc_cnn = model.evaluate(X_test, y_test)


print("The accuracy of the CNN model is:",(test_acc_cnn*100))


In [None]:
# Save the current model to CNN.h5 in the working directory.
model.save(r'./CNN.h5')

# Apply GRU architecture

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
# Size of the embedding table and width of each embedding vector.
vocab_size, embedding_dim = 10000, 1000

# Resolve the TPU cluster, connect to it and initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Distribution strategy built on the resolved TPU.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Build the network inside the strategy scope so that it is created on the TPU.
with tpu_strategy.scope():
    model = tf.keras.Sequential()
    # Embedding layer (input)
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(tf.keras.layers.GRU(50))
    model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
    model.add(tf.keras.layers.Dense(145, activation='relu'))
    # Output layer
    model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Train the model: 50 epochs of 50 steps each.
model.fit(X_train, y_train, epochs=50, steps_per_epoch=50)

In [None]:
# Evaluate the current model (the GRU when the notebook is run top to bottom) on
# the test set; the result is unpacked into the loss and the accuracy metric.
test_lost , test_acc_gru = model.evaluate(X_test, y_test)


print("The accuracy of the RNN-GRU model is:",(test_acc_gru*100))

In [None]:
# Save the current model to GRU.h5 in the working directory.
model.save(r'./GRU.h5')

# Apply RNN architecture

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
# Size of the embedding table and width of each embedding vector.
vocab_size, embedding_dim = 10000, 1000

# Resolve the TPU cluster, connect to it and initialise the TPU system.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# Distribution strategy built on the resolved TPU.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# Build the network inside the strategy scope so that it is created on the TPU.
with tpu_strategy.scope():
    model = tf.keras.Sequential()
    # Embedding layer (input)
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
    model.add(tf.keras.layers.SimpleRNN(50))
    model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
    model.add(tf.keras.layers.Dense(145, activation='relu'))
    # Output layer
    model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Train the model: 50 epochs of 50 steps each.
model.fit(X_train, y_train, epochs=50, steps_per_epoch=50)

In [None]:
# Evaluate the current model (the SimpleRNN when the notebook is run top to bottom)
# on the test set; the result is unpacked into the loss and the accuracy metric.
test_lost , test_acc_rnn = model.evaluate(X_test, y_test)


print("The accuracy of the RNN model is:",(test_acc_rnn*100))

In [None]:
# Save the current model to RNN.h5 in the working directory.
model.save(r'./RNN.h5')

# Compare each architecture used

In [None]:
import matplotlib.pyplot as plt
# Bar chart comparing the test accuracy (in percent) of each architecture.
arche = ['RNN', 'CNN', 'LSTM', 'GRU']
# Same order as `arche`; test_acc is the value produced by the LSTM evaluation cell.
accuracy = [(test_acc_rnn*100), (test_acc_cnn*100), (test_acc*100), (test_acc_gru*100)]
plt.title('Accuracy of architectures')
# The bar heights are accuracies, so accuracy labels the y-axis and the
# architecture names label the x-axis.
plt.xlabel('architectures', fontsize=12)
plt.ylabel('Accuracy (%)', fontsize=12)
plt.bar(arche, accuracy)