In [32]:
!pip install --upgrade pip  
!pip install emot
!pip install emoji
!pip install farasapy

import pandas as pd
import numpy as np
import warnings
import preprocessor as prepr
import re
import farasa
import ast
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords
from tqdm.notebook import tqdm  
from multiprocessing import Pool
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

warnings.filterwarnings('ignore')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/akzanuali/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/akzanuali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
data_train = pd.read_csv("arabic_data_train.csv", index_col = None)[['text','sarcastic']]
data_test = pd.read_csv("arabic_data_test.csv", index_col = None)[['text','sarcastic']]

data = pd.concat([data_train, data_test]).reset_index(drop=True)

In [34]:
with open('ARABIC_EMOJIS.txt') as f:
    d = f.read()
ARABIC_EMOJIS = ast.literal_eval(d)

with open('ARABIC_EMOTICONS') as f:
    d = f.read()
ARABIC_EMOTICONS = ast.literal_eval(d)

In [35]:
stop_words = stopwords.words('arabic')
#               + ['«','»','،',':','؟','.',',','"','*'])
stemmer = ISRIStemmer()
prepr.set_options(prepr.OPT.URL, prepr.OPT.MENTION, prepr.OPT.HASHTAG)

def convert_emoji_to_text(text):
    for emot in ARABIC_EMOJIS:
        text = text.replace(emot, "_".join(ARABIC_EMOJIS[emot].replace(",","").replace(":","").split()))
    return text

def convert_emoticon_to_text(text):
    for emot in ARABIC_EMOTICONS:
        text = text.replace(emot, "_".join(ARABIC_EMOTICONS[emot].replace(",","").replace(":","").split()))
    return text
        
def clean_text(text):
    text = str(text) 
    text = convert_emoji_to_text(text)                                               #converting emojis to text
    text = convert_emoticon_to_text(text)  
    text = prepr.clean(text)                 
    text = [stemmer.stem(token) for token in word_tokenize(text)]      #stemmatize+tokenization
    text = [item for item in text if item not in stop_words]                  #stopwords
    return text

In [36]:
texts = data.text
cleaned_texts = []
for item in tqdm(texts, total = len(texts)):
    cleaned_texts.append(clean_text(item))

  0%|          | 0/4502 [00:00<?, ?it/s]

In [37]:
final_texts = []
for text in tqdm(cleaned_texts, total = len(cleaned_texts)):
    final_texts.append(' '.join(text))

  0%|          | 0/4502 [00:00<?, ?it/s]

In [38]:
from collections import Counter
cnt = Counter()
for text in cleaned_texts:
    for word in text:
        cnt[word] += 1
        
cnt.most_common(10)

[('الل', 448),
 ('.', 435),
 ('ان', 369),
 (':', 321),
 ('!', 270),
 ('جمع', 226),
 ('فى', 204),
 ('مصر', 198),
 ('انا', 194),
 ('حمد', 194)]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(final_texts, data['sarcastic'].values, test_size=0.3, random_state=0)

In [40]:
from sklearn import metrics  
def metrics_result(actual, predict):                       
    print('Metrics:')
    print ('Accuracy: {0: .3f}'.format(metrics.accuracy_score(actual, predict)) ) 
    print ('Precision: {0: .3f}'.format(metrics.precision_score(actual, predict,average='weighted')) ) 
    print ('Recall: {0: 0.3f}'.format(metrics.recall_score(actual, predict,average='weighted'))   )
    print ('F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict,average='weighted'))   )

In [41]:
#TF Idf + Random Forest
text_clf = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier())
                     ])
 
text_clf.fit(X_train, y_train)
print('TF Idf + Random Forest:')
print()
predicted = text_clf.predict(X_test)
metrics_result(y_test, predicted)
print()

TF Idf + Random Forest:

Metrics:
Accuracy:  0.801
Precision:  0.776
Recall:  0.801
F1-score: 0.763



In [47]:
text_logreg = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('logreg', LogisticRegression())
                     ])
 
text_logreg.fit(X_train, y_train)

print('TF Idf + Logistic Regression:')
print()
predicted_l = text_logreg.predict(X_test)
metrics_result(y_test,predicted_l)
print()

TF Idf + Logistic Regression:

Metrics:
Accuracy:  0.794
Precision:  0.793
Recall:  0.794
F1-score: 0.726



In [48]:
text_nb = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('nb', naive_bayes.MultinomialNB())
                     ])
 
text_nb.fit(X_train, y_train)

print('TF Idf + Naive Bayes:')
print()
predicted_nb = text_nb.predict(X_test)
metrics_result(y_test,predicted_nb)
print()

TF Idf + Naive Bayes:

Metrics:
Accuracy:  0.782
Precision:  0.830
Recall:  0.782
F1-score: 0.691



In [49]:
text_svm = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('svm', svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'))
                     ])
 
text_svm.fit(X_train, y_train)

print('TF Idf + SVM:')
print()
predicted_svm = text_svm.predict(X_test)
metrics_result(y_test,predicted_svm)
print()

TF Idf + SVM:

Metrics:
Accuracy:  0.805
Precision:  0.784
Recall:  0.805
F1-score: 0.769



In [50]:
text_knn = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('knn', KNeighborsClassifier(n_neighbors=7))
                     ])
 
text_knn.fit(X_train, y_train)

print('TF Idf + Knn:')
print()
predicted_knn = text_knn.predict(X_test)
metrics_result(y_test,predicted_knn)
print()

TF Idf + Knn:

Metrics:
Accuracy:  0.801
Precision:  0.787
Recall:  0.801
F1-score: 0.748



In [51]:
k_range = list(range(1, 10))
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier()

text_knn_gr = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('knn_gr', GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1))
                     ])
 
text_knn_gr.fit(X_train, y_train)

print('TF Idf + Knn + Grid Search:')
print()
predicted_knn_gr = text_knn_gr.predict(X_test)
metrics_result(y_test,predicted_knn_gr)
print()

Fitting 10 folds for each of 9 candidates, totalling 90 fits
TF Idf + Knn + Grid Search:

Metrics:
Accuracy:  0.797
Precision:  0.780
Recall:  0.797
F1-score: 0.742



In [52]:
text_gb = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('gb', GradientBoostingClassifier())
                     ])
 
text_gb.fit(X_train, y_train)

print('TF Idf + Gradient Boosting:')
print()
predicted_gb = text_gb.predict(X_test)
metrics_result(y_test,predicted_gb)
print()

TF Idf + Gradient Boosting:

Metrics:
Accuracy:  0.793
Precision:  0.768
Recall:  0.793
F1-score: 0.735



In [53]:
text_sgd = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('sgd', SGDClassifier())
                     ])
 
text_sgd.fit(X_train, y_train)

print('TF Idf + SGD:')
print()
predicted_sgd = text_sgd.predict(X_test)
metrics_result(y_test,predicted_sgd)
print()

TF Idf + SGD:

Metrics:
Accuracy:  0.799
Precision:  0.784
Recall:  0.799
F1-score: 0.789



In [54]:
text_dt = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('dt', DecisionTreeClassifier())
                     ])
 
text_dt.fit(X_train, y_train)

print('TF Idf + Decision Tree:')
print()
predicted_dt = text_dt.predict(X_test)
metrics_result(y_test,predicted_dt)
print()

TF Idf + Decision Tree:

Metrics:
Accuracy:  0.759
Precision:  0.747
Recall:  0.759
F1-score: 0.752



In [55]:
text_ab = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('ab', AdaBoostClassifier())
                     ])
 
text_ab.fit(X_train, y_train)

print('TF Idf + Ada Boost:')
print()
predicted_ab = text_ab.predict(X_test)
metrics_result(y_test,predicted_ab)
print()

TF Idf + Ada Boost:

Metrics:
Accuracy:  0.787
Precision:  0.752
Recall:  0.787
F1-score: 0.750



In [56]:
from gensim.models import Word2Vec

sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train_w2v = np.array([vectorize(sentence) for sentence in X_train])
X_test_w2v = np.array([vectorize(sentence) for sentence in X_test])

In [57]:
clf = RandomForestClassifier()
 
clf.fit(X_train_w2v, y_train)
print('Word2Vec + Random Forest:')
print()
predicted = clf.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + Random Forest:

Metrics:
Accuracy:  0.778
Precision:  0.720
Recall:  0.778
F1-score: 0.697



In [58]:
lgr = LogisticRegression()
 
lgr.fit(X_train_w2v, y_train)
print('Word2Vec + LogisticRegression:')
print()
predicted = lgr.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + LogisticRegression:

Metrics:
Accuracy:  0.778
Precision:  0.605
Recall:  0.778
F1-score: 0.681



In [61]:
svmm = svm.SVC(C=5.0, kernel='linear', degree=5, gamma='auto')
 
svmm.fit(X_train_w2v, y_train)
print('Word2Vec + SVM:')
print()
predicted = svmm.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + SVM:

Metrics:
Accuracy:  0.778
Precision:  0.605
Recall:  0.778
F1-score: 0.681



In [60]:
sgd = SGDClassifier()
 
sgd.fit(X_train_w2v, y_train)
print('Word2Vec + SGD:')
print()
predicted = sgd.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + SGD:

Metrics:
Accuracy:  0.778
Precision:  0.605
Recall:  0.778
F1-score: 0.681

