In [1]:
import os
import sys
import pandas as pd
# adding classes folder to system path
sys.path.insert(0, os.path.abspath('..') + '/src')
from hazm import *
import hazm
import re
import numpy as np
from numpy.linalg import norm

# document-based polarity detection

## preprocessing

In [13]:
#loading documnet-based labeled data
doc_data = pd.read_csv('../data/fidibook.csv')
# doc_data1 = pd.read_csv('../data/fidibook_hannah.csv')
doc_data = pd.concat([doc_data, pd.read_csv('../data/fidibook_hannah.csv')], axis=0)
doc_data = pd.concat([doc_data, pd.read_csv('../data/amazon_paperwhite.csv')], axis=0)
doc_data = pd.concat([doc_data, pd.read_csv('../data/amazon_10thgeneration.csv')], axis=0)
doc_data['polarity'] = doc_data['polarity'].replace(1, '+1').replace(-1, '-1')
doc_data = doc_data.reset_index(drop=True)
doc_data.drop(doc_data.columns[doc_data.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

In [14]:
#number of all comments in dataset
print(doc_data.shape[0])
#number of positive and negative comments in document-level
print(doc_data['polarity'].value_counts())
doc_data

474
+1    348
-1    126
Name: polarity, dtype: int64


Unnamed: 0,date,auther,text,polarity
0,۶ آبان ۱۴۰۰,کاربر دیجی‌کالا,عالیه,+1
1,۱۹ مهر ۱۴۰۰,کاربر دیجی‌کالا,لطفا هرچه زودتر موجودش کنید,+1
2,۱۷ مهر ۱۴۰۰,متین خدامرادی,تو رو خدا موجودش کنید,+1
3,۱۵ مهر ۱۴۰۰,کاربر دیجی‌کالا,لفطا موجودش کنید,+1
4,۵ مهر ۱۴۰۰,متین ضیایی,بهترین کتاب خونیه که میتونید بخرید چون قیمتش ع...,+1
...,...,...,...,...
469,قاسم کنگرانی,۲۹ تیر ۱۴۰۰,بهترین وسیله برای مطالعه بخصوص با توجه به صنعت...,+1
470,۲۹ تیر ۱۴۰۰,کاربر دیجی‌کالا,خوبه .,+1
471,کاربر دیجی‌کالا,۱۱ تیر ۱۴۰۰,بهترین برای مطالعه کتاب ، تا وایفای رو وصل کرد...,+1
472,۱۱ تیر ۱۴۰۰,نازنین دهقان,سال هاست که کتاب خوان استفاده میکنم و این یکی ...,+1


In [15]:
train_data = doc_data.sample(frac=0.85,random_state=20)
test_data = doc_data.drop(train_data.index)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
print('polarity distribution in train set:\n', train_data['polarity'].value_counts())
print('polarity distribution in test set:\n', test_data['polarity'].value_counts())

polarity distribution in train set:
 +1    302
-1    101
Name: polarity, dtype: int64
polarity distribution in test set:
 +1    46
-1    25
Name: polarity, dtype: int64


In [16]:
def preprocess(data, PUNC_FLAG, STEM_FLAG, STWD_FLAG):
    words_dict = {}
    stop_words = hazm.utils.stopwords_list(stopwords_file= '/Users/atiyehghm/venv/lib/python3.7/site-packages/hazm/data/stopwords.dat')[:100]
    
    for m in range(data.shape[0]):
        comment = data.at[m, 'text']
        comment.replace("ي", "ی")
        #removing all puctuations
        if(PUNC_FLAG):
            comment = re.sub(r"[\[,.;:\-@#}?؟{٪٬*!)%’(&$<_>/'»«|+=.\]]+\ *", " ", comment)
        normalizer = Normalizer()
        comment = normalizer.normalize(comment)
        tokenizer = WordTokenizer()
        comment = tokenizer.tokenize(comment)
        new_comment = []
        #stem all words
        if(STEM_FLAG):
            stemmer = Stemmer()
            for t in comment:
                t1 = stemmer.stem(t)
                if(STWD_FLAG):
                    if t1 not in stop_words:
                        if t1 in words_dict.keys():
                            words_dict[t1]['frq'] += 1
                            word['docs'].add(m)
                        else:
                            word = {'id':len(words_dict), 'frq':1, 'docs':set()}
                            word['docs'].add(m)
                            words_dict[t1] = word
                        new_comment += [t1]
                else:
                    if t1 in words_dict.keys():
                        words_dict[t1]['frq'] += 1
                        word['docs'].add(m)
                    else:
                        word = {'id':len(words_dict), 'frq':1, 'docs':set()}
                        word['docs'].add(m)
                        words_dict[t1] = word
                        new_comment += [t1]
        else:
            for t in comment:
                if(STWD_FLAG):
                    if t not in stop_words:
                        if t in words_dict.keys():
                            words_dict[t]['frq'] += 1
                            word['docs'].add(m)
                        else:
                            word = {'id':len(words_dict), 'frq':1, 'docs':set()}
                            word['docs'].add(m)
                            words_dict[t] = word
                        new_comment += [t]
                else:
                    if t in words_dict.keys():
                        words_dict[t]['frq'] += 1
                        word['docs'].add(m)
                    else:
                        word = {'id':len(words_dict), 'frq':1, 'docs':set()}
                        word['docs'].add(m)
                        words_dict[t] = word
                        new_comment += [t]
        
        data.at[m, 'text'] = new_comment
    
    
    return data, words_dict

In [17]:
data = doc_data.copy()
processed_data, vocab = preprocess(data, True, True, False)
print('size of vocabulary:', len(vocab))

size of vocabulary: 3534


## tf-idf vectorization

In [18]:
def tfidf_vectorizer(data, PUNC_FLAG, STEM_FLAG, STWD_FLAG):
    preprocced_data, words_list = preprocess(data, PUNC_FLAG, STEM_FLAG, STWD_FLAG)
    n_docs = preprocced_data.shape[0]
    tfidf_vectors = np.zeros((n_docs, len(words_list)))
    for m in range(preprocced_data.shape[0]):
        comment = preprocced_data.at[m, 'text']
        for t in comment:
            word = words_list[t]
            df = len(word['docs'])
            idf = np.log((1+n_docs)/(1+df))
            tf = word['frq']
            tfidf_vectors[m][word['id']] = tf*(idf+1)
        n = norm(tfidf_vectors[m], axis=0, ord=2)
        if n != 0:
            tfidf_vectors[m] = tfidf_vectors[m]/n
    return tfidf_vectors, words_list

In [19]:
def tfidf_fit(vocab, data, PUNC_FLAG, STEM_FLAG, STWD_FLAG):
    preprocced_data, words_list = preprocess(data, PUNC_FLAG, STEM_FLAG, STWD_FLAG)
    n_docs = preprocced_data.shape[0]
    tfidf_vectors = np.zeros((n_docs, len(vocab)))
    for m in range(preprocced_data.shape[0]):
        comment = preprocced_data.at[m, 'text']
        for t in comment:
            if t in vocab.keys():
                word = vocab[t]
                df = len(word['docs'])
                idf = np.log((1+n_docs)/(1+df))
                tf = word['frq']
                tfidf_vectors[m][word['id']] = tf*(idf+1)
        n = norm(tfidf_vectors[m], axis=0, ord=2)
        if n != 0:
            tfidf_vectors[m] = tfidf_vectors[m]/n
    return tfidf_vectors

In [20]:
train = train_data.copy() 
train_vectors, vocab = tfidf_vectorizer(train, True, True, True)

In [21]:
len(vocab)

3191

In [22]:
test = test_data.copy() 
test_vectors = tfidf_fit(vocab, test,True, True, True)
print(train_vectors.shape)
print(test_vectors.shape)

(403, 3191)
(71, 3191)


## svm and naive bayes implementation

In [23]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn import metrics

# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['polarity'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(test_data['polarity'], prediction_linear, output_dict=True)
print('positive: ', report['+1'])
print('negative: ', report['-1'])
accuracy_score = metrics.accuracy_score(prediction_linear, test_data['polarity'])
print(accuracy_score)

Training time: 0.394714s; Prediction time: 0.066211s
positive:  {'precision': 0.647887323943662, 'recall': 1.0, 'f1-score': 0.7863247863247863, 'support': 46}
negative:  {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}
0.647887323943662


  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
from sklearn.naive_bayes import MultinomialNB
X_train = train_vectors
y_train = train_data['polarity']
X_test = test_vectors
y_test = test_data['polarity']
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
predicted = MNB.predict(X_test)
report = classification_report(test_data['polarity'], predicted, output_dict=True)
print('positive: ', report['+1'])
print('negative: ', report['-1'])

positive:  {'precision': 0.647887323943662, 'recall': 1.0, 'f1-score': 0.7863247863247863, 'support': 46}
negative:  {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}


  _warn_prf(average, modifier, msg_start, len(result))


# sentence-based polarity detection

In [28]:
#loading sentence-based labeled data
sent_data = pd.read_csv('../data/sentence_fidibook.csv')
sent_data = pd.concat([sent_data, pd.read_csv('../data/fidibook_hannah_sentence.csv')], axis=0)
sent_data = pd.concat([sent_data, pd.read_csv('../data/amazon_paperwhite_sentence.csv')], axis=0)
sent_data = pd.concat([sent_data, pd.read_csv('../data/amazon_10thgeneration_sentence.csv')], axis=0)
sent_data.drop(sent_data[sent_data['polarity'] == 0].index, inplace=True)
sent_data.reset_index(drop=True, inplace = True)
sent_data['polarity'] = sent_data['polarity'].replace(1, '+1').replace(2, '-1')
print(sent_data.shape[0])
print(sent_data['polarity'].value_counts())
sent_data.drop(sent_data.columns[sent_data.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
sent_data.at[812, 'polarity'] = '+1'
sent_data.tail(10)

1040
+1    694
-1    345
Name: polarity, dtype: int64


Unnamed: 0,text,polarity
1030,کاملا سبکه .,1
1031,آگهی و تبلیغات نداره و تا روشن کردم به روز‌رس...,1
1032,دوسه‌تا دیکشنری خوب داره برای خواندن داره که ...,1
1033,خیلی راضی هستم .,1
1034,بهترین وسیله برای مطالعه بخصوص با توجه به صنعت...,1
1035,خوبه .,1
1036,بهترین برای مطالعه کتاب ، تا وایفای رو وصل کرد...,1
1037,من چون هدفم فقط کتاب خوانی‌بود برام فرقی نداش...,1
1038,سال هاست که کتاب خوان استفاده میکنم و این یکی ...,1
1039,با سلام و احترام عکس‌های محصول مربوط به نسخه‌ی...,-1


In [39]:
train_sent_data = sent_data.sample(frac=0.85,random_state=20)
test_sent_data = sent_data.drop(train_sent_data.index)
train_sent_data.reset_index(drop=True, inplace=True)
test_sent_data.reset_index(drop=True, inplace=True)
print('polarity distribution in train set:\n', train_sent_data['polarity'].value_counts())
print('polarity distribution in test set:\n', test_sent_data['polarity'].value_counts())

polarity distribution in train set:
 +1    585
-1    299
Name: polarity, dtype: int64
polarity distribution in test set:
 +1    110
-1     46
Name: polarity, dtype: int64


In [40]:
train = train_sent_data.copy() 
sent_train_vectors, sent_vocab = tfidf_vectorizer(train, True, True, True)

In [41]:
test = test_sent_data.copy() 
sent_test_vectors = tfidf_fit(sent_vocab, test,True, True, True)
print(sent_train_vectors.shape)
print(sent_test_vectors.shape)

(884, 2528)
(156, 2528)


In [42]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(sent_train_vectors, train_sent_data['polarity'])
t1 = time.time()
prediction_linear = classifier_linear.predict(sent_test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(test_sent_data['polarity'], prediction_linear, output_dict=True)
print('positive: ', report['+1'])
print('negative: ', report['-1'])
accuracy_score = metrics.accuracy_score(prediction_linear, test_sent_data['polarity'])
print(accuracy_score)

Training time: 2.037231s; Prediction time: 0.315505s
positive:  {'precision': 0.7172413793103448, 'recall': 0.9454545454545454, 'f1-score': 0.8156862745098039, 'support': 110}
negative:  {'precision': 0.45454545454545453, 'recall': 0.10869565217391304, 'f1-score': 0.1754385964912281, 'support': 46}
0.6987179487179487


In [45]:
from sklearn.naive_bayes import MultinomialNB
X_train = sent_train_vectors
y_train = train_sent_data['polarity']
X_test = sent_test_vectors
y_test = test_sent_data['polarity']
BNB = MultinomialNB()
BNB.fit(X_train, y_train)
predicted = BNB.predict(X_test)
report = classification_report(test_sent_data['polarity'], predicted, output_dict=True)
print('positive: ', report['+1'])
print('negative: ', report['-1'])

positive:  {'precision': 0.7051282051282052, 'recall': 1.0, 'f1-score': 0.8270676691729323, 'support': 110}
negative:  {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 46}


  _warn_prf(average, modifier, msg_start, len(result))
