In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from keras.preprocessing import sequence, text
from keras.models import Model

from gensim.models.keyedvectors import KeyedVectors
from sklearn import preprocessing

from time import time
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
import csv


from sklearn import model_selection, naive_bayes, svm, ensemble, tree
from xgboost import XGBClassifier

from sklearn.metrics import (
    classification_report as creport
)

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn import preprocessing

# AraVec2.0: Pre-trained Arabic Word Embeddings model 
Source: https://github.com/bakrianoo/aravec/tree/master/AraVec%202.0


In [None]:
"""
Citation:
Abu Bakr Soliman, Kareem Eisa, and Samhaa R. El-Beltagy, “AraVec:
A set of Arabic Word Embedding Models for use in Arabic NLP”,
in proceedings of the 3rd International Conference on 
Arabic Computational Linguistics (ACLing 2017), Dubai, UAE, 2017.
"""
! unzip '/content/drive/MyDrive/tweets_sg_300.zip'  

Archive:  /content/drive/MyDrive/tweets_sg_300.zip
  inflating: tweets_sg_300           
  inflating: tweets_sg_300.trainables.syn1neg.npy  
  inflating: tweets_sg_300.wv.vectors.npy  


In [None]:
# Word_embedding_path
embedding_path = '/content/tweets_sg_300'           #Twitter-Skipgram model-300d(trained on 77,600,000 Arabic tweets)

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
train_data

Unnamed: 0,Tweet,Class
0,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,normal
1,سورية بلد الحضارات تربطها بعلية او بحيوان,normal
2,اخي الحاج اذا شعرت انك محرجا من الانتقادات لتص...,normal
3,ما فيك تعيش بلا ما تكب فتن ليل نهار وبكرة قلهم...,normal
4,هذا البطل الذي قاتل وجاذف بحياته لتحيا انت يا ...,abusive
...,...,...
4671,كول هوا مرة تانيي وحلوا عن طيزو وطيزنا ومقلعين...,abusive
4672,رئيس روحي؟ تروح روحك انت وكل مين شدّ عمشدّك مش...,abusive
4673,إذا أرادت إسرائيل أن تضمن أمنها وهو حق عليها ا...,normal
4674,خليك بحالك يا نعيمي على أساس أنت مش مرتزق و طب...,abusive


In [None]:
X = []
y = []
for data_path in ["/content/drive/MyDrive/OSACT4"]:
     with open(data_path, 'r') as f:
          for i, line in enumerate(f):
              if i == 0: continue
              else:
                  temp = line.split(',')
                  X.append(temp[0].split())
                  y.append(temp[1].replace('\n', ''))
X, y = np.array(X), np.array(y)

  # This is added back by InteractiveShellApp.init_path()


In [None]:
len(X)

4676

## Get Word2Vec

In [None]:
def get_init_parameters(path, ext=None):
    word_model = KeyedVectors.load(path).wv
    n_words = len(word_model.vocab)
    vocab_dim = word_model[word_model.index2word[0]].shape[0]
    index_dict = dict()
    for i in range(n_words):
        index_dict[word_model.index2word[i]] = i+1
    print('Number of words in the word embedding',n_words)
    #print('word_model', word_model)
    #print("index_dict",index_dict)
    return word_model, index_dict, n_words, vocab_dim

In [None]:
WORD_MODEL, index_dict, MAX_FEATURES, EMBED_SIZE = get_init_parameters(embedding_path) 

Number of words in the word embedding 331679


In [None]:
EMBED_SIZE

300

In [None]:
len(index_dict)

331679

In [None]:
def get_word_index(train_raw_text, test_raw_text, n_words):
    tokenizer = text.Tokenizer(num_words=n_words)
    tokenizer.fit_on_texts(list(train_raw_text))
    word_index = tokenizer.word_index

    return word_index

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
 word_index  = get_word_index(X,X_test,MAX_FEATURES)

In [None]:
def w2v(word_index, embedding_index, vocab_dim):
    print('Building embedding matrix...')
    dicc={}
    embedding_matrix = np.zeros((len(word_index) + 1, vocab_dim))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index.get_vector(word)
        except:
            pass
        dicc[word]= embedding_matrix[i]

    print('Embedding matrix built.') 
    #print("Word index", word_index.items())
    #print(embedding_matrix) 
    return dicc

In [None]:
dicc= w2v(word_index, WORD_MODEL, EMBED_SIZE)

Building embedding matrix...
Embedding matrix built.


In [None]:
len(dicc)

16522

In [None]:
"""
To use AraVec2.0 with the classical machine learning models, 
the average vector of all the embeddings of the tweet words is computed
"""
class MeanEmbeddingVectorizer(object):
    def __init__(self, dicc):
        self.dicc = dicc
        if len(dicc)>0:
            self.dim=300
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.dicc[w] for w in words if w in self.dicc] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

# Word Embeddings

We experimented with various classical machine learning models:

1.  SVM
2.  Random Forest
3.  XGBoost
4.  Extra Trees
5.  Decision Trees
6.  Gradient Boosting
7.  Logistic Regression









## 1. SVM Classifier

In [None]:
from sklearn.pipeline import Pipeline

svm_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("svm_w2v",  svm.SVC())])

In [None]:
time_start = time()

svm_w2v= svm_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 5.26 (s)


In [None]:
predictions_SVM = svm_w2v.predict(X_test)    

In [None]:
print("SVM macro-averaged F1-score -> ", f1_score(y_test, predictions_SVM,average='macro'))

SVM macro-averaged F1-score ->  0.5647011772266355


In [None]:
print(creport(y_test, predictions_SVM,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.7269    0.6250    0.6721       264
     abusive     0.7143    0.0769    0.1389        65
        hate     0.8234    0.9522    0.8831       607

    accuracy                         0.7991       936
   macro avg     0.7548    0.5514    0.5647       936
weighted avg     0.7886    0.7991    0.7719       936



## 2. RandomForest Classifier

In [None]:
from sklearn.pipeline import Pipeline

RF_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("RF",   ensemble.RandomForestClassifier())])

In [None]:
time_start = time()

RF_w2v= RF_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 5.33 (s)


In [None]:
predictions_RF = RF_w2v.predict(X_test) 

In [None]:
print("RF macro-averaged F1-score -> ",f1_score(y_test, predictions_RF,average='macro'))

RF macro-averaged F1-score ->  0.47154880163986124


In [None]:
print(creport(y_test, predictions_RF,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.6902    0.4811    0.5670       264
     abusive     0.0000    0.0000    0.0000        65
        hate     0.7660    0.9489    0.8477       607

    accuracy                         0.7511       936
   macro avg     0.4854    0.4767    0.4715       936
weighted avg     0.6914    0.7511    0.7096       936



  _warn_prf(average, modifier, msg_start, len(result))


## 3. XGBClassifier

In [None]:
XGB_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("XGB",   XGBClassifier())])

In [None]:
time_start = time()

XGB_w2v= XGB_w2v.fit(X_train, y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 22.43 (s)


In [None]:
predictions_XGB = XGB_w2v.predict(X_test)

In [None]:
print("XGB macro-averaged F1-score -> ", f1_score(y_test, predictions_XGB,average='macro'))

XGB macro-averaged F1-score ->  0.5475709399821543


In [None]:
print(creport(y_test, predictions_XGB,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.7018    0.5795    0.6349       264
     abusive     0.5000    0.0769    0.1333        65
        hate     0.8121    0.9473    0.8745       607

    accuracy                         0.7831       936
   macro avg     0.6713    0.5346    0.5476       936
weighted avg     0.7594    0.7831    0.7555       936



## 4. ExtraTrees Classifier

In [None]:
extraTrees_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("extraTrees",   ensemble.ExtraTreesClassifier())])

In [None]:
time_start = time()

extraTrees_w2v= extraTrees_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 1.43 (s)


In [None]:
predictions_extraTrees= extraTrees_w2v.predict(X_test)

In [None]:
print("ExtraTreesClassifier macro-averaged F1-score -> ",f1_score(y_test, predictions_extraTrees,average='macro'))

ExtraTreesClassifier macro-averaged F1-score ->  0.46720500430353845


In [None]:
print(creport(y_test, predictions_extraTrees,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.7256    0.4508    0.5561       264
     abusive     0.0000    0.0000    0.0000        65
        hate     0.7552    0.9605    0.8455       607

    accuracy                         0.7500       936
   macro avg     0.4936    0.4704    0.4672       936
weighted avg     0.6944    0.7500    0.7052       936



  _warn_prf(average, modifier, msg_start, len(result))


## 5. GradientBoosting Classifier

In [None]:
GB_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("GradientBoostingClassifier",   ensemble.GradientBoostingClassifier())])

In [None]:
time_start = time()

GB_w2v= GB_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 122.7 (s)


In [None]:
predictions_GB= GB_w2v.predict(X_test)

In [None]:
print("GradientBoostingClassifier macro-averaged F1-score -> ", f1_score(y_test, predictions_GB,average='macro'))

GradientBoostingClassifier macro-averaged F1-score ->  0.5727337832852275


In [None]:
print(creport(y_test, predictions_GB,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.6824    0.6023    0.6398       264
     abusive     0.6667    0.1231    0.2078        65
        hate     0.8177    0.9308    0.8706       607

    accuracy                         0.7821       936
   macro avg     0.7222    0.5521    0.5727       936
weighted avg     0.7690    0.7821    0.7595       936



## 6. DecisionTree Classifier

In [None]:
DT_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("DT",   tree.DecisionTreeClassifier())])

In [None]:
time_start = time()

DT_w2v= DT_w2v.fit(X_train,y_train)

time_start = time() - time_start

print("Took : "+str(np.round(time_start, 2))+" (s)") 

Took : 1.59 (s)


In [None]:
predictions_DT= DT_w2v.predict(X_test)

In [None]:
print("DecisionTreeClassifier macro-averaged F1-score -> ",f1_score(y_test, predictions_DT,average='macro'))

DecisionTreeClassifier macro-averaged F1-score ->  0.42776292503333097


In [None]:
print(creport(y_test, predictions_DT,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.4225    0.4545    0.4380       264
     abusive     0.1039    0.1231    0.1127        65
        hate     0.7530    0.7133    0.7327       607

    accuracy                         0.5994       936
   macro avg     0.4265    0.4303    0.4278       936
weighted avg     0.6147    0.5994    0.6065       936



## 7. Logistic regression

In [None]:
from sklearn import linear_model

In [None]:
from sklearn.pipeline import Pipeline

LR_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(dicc)),
    ("LR_w2v",  linear_model.LogisticRegression(multi_class='multinomial'))])

In [None]:
LR_w2v= LR_w2v.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
predictions_LR= LR_w2v.predict(X_test)

In [None]:
print("DecisionTreeClassifier macro-averaged F1-score -> ", f1_score(y_test, predictions_LR,average='macro'))

DecisionTreeClassifier macro-averaged F1-score ->  0.5561613014884044


In [None]:
print(creport(y_test, predictions_LR,target_names=["normal", "abusive", "hate"],digits=4))

              precision    recall  f1-score   support

      normal     0.6708    0.6098    0.6389       264
     abusive     0.3684    0.1077    0.1667        65
        hate     0.8183    0.9127    0.8629       607

    accuracy                         0.7714       936
   macro avg     0.6192    0.5434    0.5562       936
weighted avg     0.7455    0.7714    0.7514       936

