In [112]:
!pip install --upgrade pip  
!pip install emot
!pip install emoji
!pip install hazm

import pandas as pd
import numpy as np
import warnings
import preprocessor as prepr
import re
import ast
import hazm
import nltk

from tqdm.notebook import tqdm  
from multiprocessing import Pool
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

warnings.filterwarnings('ignore')
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/akzanuali/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
data = pd.read_csv("MirasIrony.csv", names=['text', 'label'], index_col = None, header=None)

In [126]:
data.head(5)

Unnamed: 0,text,label
0,""" نماینده مجلس خطاب به قربانی اسیدپاشی : ازدس...",0
1,_آرره دیگه بعدش گفتم استاد دارید مسخره میکنید ...,1
2,_چجوری میتونی اصفهانی باشی و هوادار پرسپولیس؟ ...,1
3,- داداش #سربازی ت کی تموم میشه ایشالا؟ + چیزی ...,0
4,- در گذشته زندگی می کنی یا حال؟ اگر در گذشته ز...,1


In [125]:
with open('PERSIAN_EMOJIS.txt') as f:
    d = f.read()
PERSIAN_EMOJIS = ast.literal_eval(d)

with open('PERSIAN_EMOTICONS.txt') as f:
    d = f.read()
PERSIAN_EMOTICONS = ast.literal_eval(d)

stopwords = list(pd.read_csv("persian_stopwords.csv", names=['word', '-'], sep = ';',index_col = None, header=None)[1:]['word'].reset_index(drop=True))

In [107]:
stop_words = set(stopwords + ['«','»','،',':','؟','.',',','"','*'])
stemmer = hazm.Stemmer()
normalizer = hazm.Normalizer()

prepr.set_options(prepr.OPT.URL, prepr.OPT.MENTION, prepr.OPT.HASHTAG)

def convert_emoji_to_text(text):
    for emot in PERSIAN_EMOJIS:
        text = text.replace(emot, "_".join(PERSIAN_EMOJIS[emot].replace(",","").replace(":","").split()))
    return text

def convert_emoticon_to_text(text):
    for emot in PERSIAN_EMOTICONS:
        text = text.replace(emot, "_".join(PERSIAN_EMOTICONS[emot].replace(",","").replace(":","").split()))
    return text
        
def clean_text(text):
    text = str(text) 
    text = normalizer.normalize(text)
    text = convert_emoji_to_text(text)                                               #converting emojis to text
    text = convert_emoticon_to_text(text)  
    text = prepr.clean(text)                 
    text = [stemmer.stem(token) for token in hazm.word_tokenize(text)]      #stemmatize+tokenization
    text = [item for item in text if item not in stop_words]                  #stopwords
    
    return text

In [108]:
texts = data.text
cleaned_texts = []
for item in tqdm(texts, total = len(texts)):
    cleaned_texts.append(clean_text(item))

  0%|          | 0/2942 [00:00<?, ?it/s]

In [109]:
final_texts = []
for text in tqdm(cleaned_texts, total = len(cleaned_texts)):
    final_texts.append(' '.join(text))

  0%|          | 0/2942 [00:00<?, ?it/s]

In [127]:
from collections import Counter
cnt = Counter()
for text in cleaned_texts:
    for word in text:
        cnt[word] += 1

In [75]:
X_train, X_test, y_train, y_test = train_test_split(final_texts, data['label'].values, test_size=0.3, random_state=0)

In [76]:
from sklearn import metrics  
def metrics_result(actual, predict):                       
    print('Metrics:')
    print ('Accuracy: {0: .3f}'.format(metrics.accuracy_score(actual, predict)) ) 
    print ('Precision: {0: .3f}'.format(metrics.precision_score(actual, predict,average='weighted')) ) 
    print ('Recall: {0: 0.3f}'.format(metrics.recall_score(actual, predict,average='weighted'))   )
    print ('F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict,average='weighted'))   )

In [77]:
#TF Idf + Random Forest
text_clf = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('clf', RandomForestClassifier())
                     ])
 
text_clf.fit(X_train, y_train)
print('TF Idf + Random Forest:')
print()
predicted = text_clf.predict(X_test)
metrics_result(y_test, predicted)
print()

TF Idf + Random Forest:

Metrics:
Accuracy:  0.676
Precision:  0.676
Recall:  0.676
F1-score: 0.650



In [78]:
text_logreg = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('logreg', LogisticRegression())
                     ])
 
text_logreg.fit(X_train, y_train)

print('TF Idf + Logistic Regression:')
print()
predicted_l = text_logreg.predict(X_test)
metrics_result(y_test,predicted_l)
print()

TF Idf + Logistic Regression:

Metrics:
Accuracy:  0.650
Precision:  0.640
Recall:  0.650
F1-score: 0.638



In [79]:
text_nb = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('nb', naive_bayes.MultinomialNB())
                     ])
 
text_nb.fit(X_train, y_train)

print('TF Idf + Naive Bayes:')
print()
predicted_nb = text_nb.predict(X_test)
metrics_result(y_test,predicted_nb)
print()

TF Idf + Naive Bayes:

Metrics:
Accuracy:  0.658
Precision:  0.649
Recall:  0.658
F1-score: 0.640



In [80]:
text_svm = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('svm', svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto'))
                     ])
 
text_svm.fit(X_train, y_train)

print('TF Idf + SVM:')
print()
predicted_svm = text_svm.predict(X_test)
metrics_result(y_test,predicted_svm)
print()

TF Idf + SVM:

Metrics:
Accuracy:  0.635
Precision:  0.632
Recall:  0.635
F1-score: 0.634



In [81]:
text_knn = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('knn', KNeighborsClassifier(n_neighbors=7))
                     ])
 
text_knn.fit(X_train, y_train)

print('TF Idf + Knn:')
print()
predicted_knn = text_knn.predict(X_test)
metrics_result(y_test,predicted_knn)
print()

TF Idf + Knn:

Metrics:
Accuracy:  0.595
Precision:  0.586
Recall:  0.595
F1-score: 0.588



In [82]:
k_range = list(range(1, 10))
param_grid = dict(n_neighbors=k_range)
knn = KNeighborsClassifier()

text_knn_gr = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('knn_gr', GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1))
                     ])
 
text_knn_gr.fit(X_train, y_train)

print('TF Idf + Knn + Grid Search:')
print()
predicted_knn_gr = text_knn_gr.predict(X_test)
metrics_result(y_test,predicted_knn_gr)
print()

Fitting 10 folds for each of 9 candidates, totalling 90 fits
TF Idf + Knn + Grid Search:

Metrics:
Accuracy:  0.610
Precision:  0.602
Recall:  0.610
F1-score: 0.604



In [83]:
text_gb = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('gb', GradientBoostingClassifier())
                     ])
 
text_gb.fit(X_train, y_train)

print('TF Idf + Gradient Boosting:')
print()
predicted_gb = text_gb.predict(X_test)
metrics_result(y_test,predicted_gb)
print()

TF Idf + Gradient Boosting:

Metrics:
Accuracy:  0.664
Precision:  0.657
Recall:  0.664
F1-score: 0.642



In [84]:
text_sgd = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('sgd', SGDClassifier())
                     ])
 
text_sgd.fit(X_train, y_train)

print('TF Idf + SGD:')
print()
predicted_sgd = text_sgd.predict(X_test)
metrics_result(y_test,predicted_sgd)
print()

TF Idf + SGD:

Metrics:
Accuracy:  0.608
Precision:  0.618
Recall:  0.608
F1-score: 0.611



In [85]:
text_dt = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('dt', DecisionTreeClassifier())
                     ])
 
text_dt.fit(X_train, y_train)

print('TF Idf + Decision Tree:')
print()
predicted_dt = text_dt.predict(X_test)
metrics_result(y_test,predicted_dt)
print()

TF Idf + Decision Tree:

Metrics:
Accuracy:  0.599
Precision:  0.593
Recall:  0.599
F1-score: 0.595



In [86]:
text_ab = Pipeline([
                     ('tfidf', TfidfVectorizer()),
                     ('ab', AdaBoostClassifier())
                     ])
 
text_ab.fit(X_train, y_train)

print('TF Idf + Ada Boost:')
print()
predicted_ab = text_ab.predict(X_test)
metrics_result(y_test,predicted_ab)
print()

TF Idf + Ada Boost:

Metrics:
Accuracy:  0.650
Precision:  0.640
Recall:  0.650
F1-score: 0.632



In [87]:
from gensim.models import Word2Vec

sentences = [sentence.split() for sentence in X_train]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train_w2v = np.array([vectorize(sentence) for sentence in X_train])
X_test_w2v = np.array([vectorize(sentence) for sentence in X_test])

In [88]:
clf = RandomForestClassifier()
 
clf.fit(X_train_w2v, y_train)
print('Word2Vec + Random Forest:')
print()
predicted = clf.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + Random Forest:

Metrics:
Accuracy:  0.589
Precision:  0.579
Recall:  0.589
F1-score: 0.581



In [89]:
lgr = LogisticRegression()
 
lgr.fit(X_train_w2v, y_train)
print('Word2Vec + LogisticRegression:')
print()
predicted = lgr.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + LogisticRegression:

Metrics:
Accuracy:  0.597
Precision:  0.529
Recall:  0.597
F1-score: 0.452



In [94]:
svmm = svm.SVC(C=2.0, kernel='linear', degree=1, gamma='auto')
 
svmm.fit(X_train_w2v, y_train)
print('Word2Vec + SVM:')
print()
predicted = svmm.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + SVM:

Metrics:
Accuracy:  0.597
Precision:  0.356
Recall:  0.597
F1-score: 0.446



In [91]:
sgd = SGDClassifier()
 
sgd.fit(X_train_w2v, y_train)
print('Word2Vec + SGD:')
print()
predicted = sgd.predict(X_test_w2v)
metrics_result(y_test, predicted)
print()

Word2Vec + SGD:

Metrics:
Accuracy:  0.597
Precision:  0.356
Recall:  0.597
F1-score: 0.446

