In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from keras.preprocessing import sequence, text
from keras.models import Model

from gensim.models.keyedvectors import KeyedVectors
from sklearn import preprocessing

from time import time
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
import csv


from sklearn import model_selection, naive_bayes, svm, ensemble, tree
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report as creport
)

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn import preprocessing

# AraVec2.0: Pre-trained Arabic Word Embeddings model 
Source: https://github.com/bakrianoo/aravec/tree/master/AraVec%202.0


In [None]:
"""
Citation:
Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec:
A set of Arabic Word Embedding Models for use in Arabic NLP”,
in proceedings of the 3rd International Conference on 
Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.
"""
! unzip '/content/drive/MyDrive/tweets_sg_300.zip'  

Archive:  /content/drive/MyDrive/tweets_sg_300.zip
  inflating: tweets_sg_300           
  inflating: tweets_sg_300.trainables.syn1neg.npy  
  inflating: tweets_sg_300.wv.vectors.npy  


In [None]:
# Word_embedding_path
embedding_path = '/content/tweets_sg_300'           #Twitter-Skipgram model-300d(trained on 77,600,000 Arabic tweets)

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/Updated_Dataset.csv')
train_data

Unnamed: 0,Tweet,Class
0,وسخ وليس وصخ هاه مين الوسخ فينا يا نجم ...,hate
1,لا تحسبوني نسيتكم يا عبنده يا كويحة يا م...,hate
2,تحرير فلسطين اله رجاله ، وانتوا يا نسل ال...,hate
3,يا لبناني يا فضلات الاستعمار الفرنسي اللب...,hate
4,الخيانه والغدر والعماله من خصالكم نحن من ح...,hate
...,...,...
2514,يلي لهلا مو عرفان هاد يا اجدب يا اما عم يجدبها,abusive
2515,يمثل تحفة الفن والعمارة القوطية ويعد من المعال...,hate
2516,يمكن لو كان ابوك مربيك وضاربك شي كفين ...,abusive
2517,يهودي منهم وفيهم,normal


In [None]:
# Read the dataset into X (one list of whitespace-separated tokens per line)
# and y (the label string of that line).
# The first line of each file is treated as a header and skipped; on every
# other line, field 0 (before the first comma) is the text and field 1 the label.
# NOTE(review): a plain split(',') cuts any text that itself contains a comma —
# confirm the file never has commas inside the tweet field.
X = []
y = []
for data_path in ["/content/drive/MyDrive/OSACT4"]:
    # Explicit UTF-8 so the Arabic text is decoded the same way regardless of
    # the platform's default locale encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                continue  # ignore blank lines instead of failing on temp[1]
            temp = line.split(',')
            X.append(temp[0].split())
            y.append(temp[1].strip())

# The token lists have different lengths, so np.array(X) would build a ragged
# array (deprecated, and an error on recent NumPy). Fill a 1-D object array
# explicitly so that each element is one token list.
tokenised = np.empty(len(X), dtype=object)
for row, tokens in enumerate(X):
    tokenised[row] = tokens
X, y = tokenised, np.array(y)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
len(X)

2519

## Get Word2Vec

In [None]:
def get_init_parameters(path, ext=None):
    """Load a saved gensim embedding model and derive its basic parameters.

    Parameters
    ----------
    path : str
        Location of the saved model, passed to ``KeyedVectors.load``.
    ext : optional
        Unused; kept so that existing callers passing it keep working.

    Returns
    -------
    word_model
        The word-vector object: ``.wv`` of the loaded object when it has one,
        otherwise the loaded object itself.
    index_dict : dict
        Maps every vocabulary word to its position in the model's word list,
        counted from 1 (index 0 is left unused — presumably reserved for
        padding; confirm against callers).
    n_words : int
        Number of words in the embedding vocabulary.
    vocab_dim : int
        Dimensionality of the word vectors.
    """
    loaded = KeyedVectors.load(path)
    # A full Word2Vec model exposes its vectors as .wv; a bare KeyedVectors
    # object already is the vector store.
    word_model = getattr(loaded, 'wv', loaded)

    # gensim >= 4 renamed `index2word` to `index_to_key` and removed `vocab`;
    # support both so the notebook runs on either major version.
    vocab_words = getattr(word_model, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = word_model.index2word

    n_words = len(vocab_words)
    vocab_dim = word_model.vector_size
    index_dict = {word: position + 1 for position, word in enumerate(vocab_words)}
    print('Number of words in the word embedding',n_words)
    return word_model, index_dict, n_words, vocab_dim

In [None]:
WORD_MODEL, index_dict, MAX_FEATURES, EMBED_SIZE = get_init_parameters(embedding_path) 

Number of words in the word embedding 331679


In [None]:
EMBED_SIZE

300

In [None]:
len(index_dict)

331679

In [None]:
def get_word_index(train_raw_text, test_raw_text, n_words):
    """Fit a Keras tokenizer on the training texts and return its vocabulary.

    Parameters
    ----------
    train_raw_text : iterable
        Texts handed to ``Tokenizer.fit_on_texts`` (as a list).
    test_raw_text : iterable
        Accepted for interface compatibility; not used by this function.
    n_words : int
        Passed to the tokenizer as ``num_words``.

    Returns
    -------
    dict
        The fitted tokenizer's ``word_index`` mapping.
    """
    fitted = text.Tokenizer(num_words=n_words)
    fitted.fit_on_texts([sample for sample in train_raw_text])
    return fitted.word_index

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
 word_index  = get_word_index(X,X_test,MAX_FEATURES)

In [None]:
def w2v(word_index, embedding_index, vocab_dim):
    """Build a word -> embedding-vector lookup for the dataset vocabulary.

    Parameters
    ----------
    word_index : dict
        Maps each word to an integer row index; rows ``0..len(word_index)``
        are allocated, so indices must fall in that range.
    embedding_index
        Object exposing ``get_vector(word)`` that raises ``KeyError`` for
        unknown words.
    vocab_dim : int
        Length of each embedding vector.

    Returns
    -------
    dict
        ``word -> 1-D float array`` of length ``vocab_dim``. Words missing
        from ``embedding_index`` are still present, mapped to an all-zero
        vector, so code that averages these vectors will include the zeros.
    """
    print('Building embedding matrix...')
    dicc = {}
    embedding_matrix = np.zeros((len(word_index) + 1, vocab_dim))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: keep its zero row. Only the lookup miss
            # is tolerated; any other error (bad shape, interrupt, ...) must
            # surface instead of being silently swallowed.
            pass
        dicc[word] = embedding_matrix[i]

    print('Embedding matrix built.')
    return dicc

In [None]:
dicc= w2v(word_index, WORD_MODEL, EMBED_SIZE)

Building embedding matrix...
Embedding matrix built.


In [None]:
len(dicc)

12357

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised text by the mean of its word vectors.

    To use AraVec2.0 with the classical machine learning models,
    the average vector of all the embeddings of the tweet words is computed.

    Parameters
    ----------
    dicc : dict
        Maps a word to its embedding vector. All vectors are expected to have
        the same length.

    Attributes
    ----------
    dicc : dict
        The lookup passed to the constructor.
    dim : int
        Embedding length, taken from the first vector in ``dicc``
        (0 when ``dicc`` is empty).
    """
    def __init__(self, dicc):
        self.dicc = dicc
        if len(dicc) > 0:
            # Infer the dimensionality from the data instead of hard-coding
            # 300, so the zero fallback in transform() always matches the
            # vectors actually supplied.
            self.dim = len(next(iter(dicc.values())))
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns ``self``.

        ``y`` is optional so the object can be fitted with or without labels.
        """
        return self

    def transform(self, X):
        """Return an array with one mean vector per item of ``X``.

        Each item of ``X`` is an iterable of words. Words absent from
        ``dicc`` are ignored; an item with no known word yields a zero vector
        of length ``dim``.
        """
        return np.array([
            np.mean([self.dicc[w] for w in words if w in self.dicc]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

# Word Embeddings

We experimented with various classical machine learning models:

1.  SVM
2.  Random Forest
3.  XGBoost
4.  Extra Trees
5.  Gradient Boosting
6.  Decision Trees
7.  Logistic Regression









## 1. SVM Classifier

In [None]:
from sklearn.pipeline import Pipeline

svm_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("svm_w2v",  svm.SVC())])

In [None]:
time_start = time()

svm_w2v= svm_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 1.43 (s)


In [None]:
predictions_SVM = svm_w2v.predict(X_test)    

In [None]:
print("SVM macro-averaged F1-score -> ", f1_score(y_test, predictions_SVM,average='macro'))

SVM macro-averaged F1-score ->  0.7261077027948194


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_SVM, digits=4))

              precision    recall  f1-score   support

      normal     0.6746    0.6590    0.6667       173
     abusive     0.7117    0.7117    0.7117       163
        hate     0.7907    0.8095    0.8000       168

    accuracy                         0.7262       504
   macro avg     0.7256    0.7267    0.7261       504
weighted avg     0.7253    0.7262    0.7257       504



## 2. RandomForest Classifier

In [None]:
from sklearn.pipeline import Pipeline

# Fixed seed so the forest is rebuilt identically on every run and the reported
# scores are reproducible; 42 matches the seed used for the train/test split.
RF_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("RF",   ensemble.RandomForestClassifier(random_state=42))])

In [None]:
time_start = time()

RF_w2v= RF_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 2.32 (s)


In [None]:
predictions_RF = RF_w2v.predict(X_test) 

In [None]:
print("RF macro-averaged F1-score -> ",f1_score(y_test, predictions_RF,average='macro'))

RF macro-averaged F1-score ->  0.6780679968690867


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_RF, digits=4))

              precision    recall  f1-score   support

      normal     0.5979    0.6705    0.6322       173
     abusive     0.7172    0.6380    0.6753       163
        hate     0.7333    0.7202    0.7267       168

    accuracy                         0.6766       504
   macro avg     0.6828    0.6763    0.6781       504
weighted avg     0.6817    0.6766    0.6776       504



## 3. XGBClassifier

In [None]:
XGB_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("XGB",   XGBClassifier())])

In [None]:
time_start = time()

XGB_w2v= XGB_w2v.fit(X_train, y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 12.25 (s)


In [None]:
predictions_XGB = XGB_w2v.predict(X_test)

In [None]:
print("XGB macro-averaged F1-score -> ", f1_score(y_test, predictions_XGB,average='macro'))

XGB macro-averaged F1-score ->  0.7080182354518637


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_XGB, digits=4))

              precision    recall  f1-score   support

      normal     0.6687    0.6416    0.6549       173
     abusive     0.6647    0.6810    0.6727       163
        hate     0.7895    0.8036    0.7965       168

    accuracy                         0.7083       504
   macro avg     0.7076    0.7087    0.7080       504
weighted avg     0.7076    0.7083    0.7078       504



## 4. ExtraTrees Classifier

In [None]:
# Fixed seed so the randomised trees are rebuilt identically on every run and
# the reported scores are reproducible; 42 matches the train/test split seed.
extraTrees_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("extraTrees",   ensemble.ExtraTreesClassifier(random_state=42))])

In [None]:
time_start = time()

extraTrees_w2v= extraTrees_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 0.75 (s)


In [None]:
predictions_extraTrees= extraTrees_w2v.predict(X_test)

In [None]:
print("ExtraTreesClassifier macro-averaged F1-score -> ",f1_score(y_test, predictions_extraTrees,average='macro'))

ExtraTreesClassifier macro-averaged F1-score ->  0.6811020014807546


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_extraTrees, digits=4))

              precision    recall  f1-score   support

      normal     0.6163    0.6127    0.6145       173
     abusive     0.6606    0.6687    0.6646       163
        hate     0.7665    0.7619    0.7642       168

    accuracy                         0.6806       504
   macro avg     0.6811    0.6811    0.6811       504
weighted avg     0.6807    0.6806    0.6806       504



## 5. GradientBoosting Classifier

In [None]:
# Fixed seed so the boosted trees are rebuilt identically on every run and the
# reported scores are reproducible; 42 matches the train/test split seed.
GB_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("GradientBoostingClassifier",   ensemble.GradientBoostingClassifier(random_state=42))])

In [None]:
time_start = time()

GB_w2v= GB_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 52.24 (s)


In [None]:
predictions_GB= GB_w2v.predict(X_test)

In [None]:
print("GradientBoostingClassifier macro-averaged F1-score -> ", f1_score(y_test, predictions_GB,average='macro'))

GradientBoostingClassifier macro-averaged F1-score ->  0.6944927536231883


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_GB, digits=4))

              precision    recall  f1-score   support

      normal     0.6453    0.6416    0.6435       173
     abusive     0.6728    0.6687    0.6708       163
        hate     0.7647    0.7738    0.7692       168

    accuracy                         0.6944       504
   macro avg     0.6943    0.6947    0.6945       504
weighted avg     0.6940    0.6944    0.6942       504



## 6. DecisionTree Classifier

In [None]:
# Fixed seed so the tree is rebuilt identically on every run and the reported
# scores are reproducible; 42 matches the train/test split seed.
DT_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("DT",   tree.DecisionTreeClassifier(random_state=42))])

In [None]:
time_start = time()

DT_w2v= DT_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 0.75 (s)


In [None]:
predictions_DT= DT_w2v.predict(X_test)

In [None]:
print("DecisionTreeClassifier macro-averaged F1-score -> ",f1_score(y_test, predictions_DT,average='macro'))

DecisionTreeClassifier macro-averaged F1-score ->  0.4722745632956901


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_DT, digits=4))

              precision    recall  f1-score   support

      normal     0.4688    0.4335    0.4505       173
     abusive     0.4427    0.5215    0.4789       163
        hate     0.5132    0.4643    0.4875       168

    accuracy                         0.4722       504
   macro avg     0.4749    0.4731    0.4723       504
weighted avg     0.4751    0.4722    0.4720       504



## 7. LogisticRegression Classifier

In [None]:
from sklearn import linear_model

In [None]:
from sklearn.pipeline import Pipeline

LR_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("LR_w2v",  linear_model.LogisticRegression(multi_class='multinomial'))])

In [None]:
LR_w2v= LR_w2v.fit(X_train, y_train)

In [None]:
predictions_LR= LR_w2v.predict(X_test)

In [None]:
# The label names the model actually being scored (Logistic Regression); the
# previous text was copied from the decision-tree section.
print("LogisticRegression macro-averaged F1-score -> ", f1_score(y_test, predictions_LR,average='macro'))

DecisionTreeClassifier macro-averaged F1-score ->  0.7059934535047657


In [None]:
# classification_report orders its rows by the sorted label values and applies
# target_names positionally, so a hand-ordered ["normal", "abusive", "hate"]
# list mislabels rows unless that happens to be the sorted order.
# Let the report print the real label of each row instead.
print(creport(y_test, predictions_LR, digits=4))

              precision    recall  f1-score   support

      normal     0.6788    0.6474    0.6627       173
     abusive     0.6707    0.6871    0.6788       163
        hate     0.7674    0.7857    0.7765       168

    accuracy                         0.7063       504
   macro avg     0.7056    0.7067    0.7060       504
weighted avg     0.7057    0.7063    0.7058       504

