In [17]:
'''
จากบทความ "ลองเล่น Sentiment Analysis"
โดย Nuipin Decimo
https://bit.ly/3cMgR5a
'''

'\nจากบทความ "ลองเล่น Sentiment Analysis"\nโดย Nuipin Decimo\nhttps://bit.ly/3cMgR5a\n'

In [18]:
# ใช้ตัดคำภาษาไทย
import deepcut
# ใช้งาน regex
import re
# จัดการเกี่ยวกับ array
import numpy as np
# สำหรับทำ classify และทดสอบโมเดล
from nltk import FreqDist, precision, recall, f_measure, NaiveBayesClassifier
from nltk.classify import apply_features
from nltk.classify import util
# สำหรับสร้างชุดข้อมูลสำหรับ train และ test เพื่อทดสอบประสิทธิภาพ
from sklearn.model_selection import KFold
import collections, itertools
# for save model
import pickle

In [19]:
# Thai Sentiment Text Analysis
# คลังข้อมูลสำหรับ Sentiment ภาษาไทย โดย นาย วรรณพงษ์ ภัททิยไพบูลย์
# https://github.com/PyThaiNLP/lexicon-thai

data_pos = [(line.strip(), 'pos') for line in open("pos.txt", 'r', encoding="utf8")]
data_neg = [(line.strip(), 'neg') for line in open("neg.txt", 'r', encoding="utf8")]

In [25]:
len(data_pos)

467

In [26]:
len(data_neg)

596

In [20]:
def split_words (sentence):
    return deepcut.tokenize(''.join(sentence.lower().split()))
    
sentences = [(split_words(sentence), sentiment) for (sentence, sentiment) in data_pos + data_neg + data_neutral]

In [21]:
def get_words_in_sentences(sentences):
    all_words = []
    for (words, sentiment) in sentences:
        all_words.extend(words)
    return all_words

In [22]:
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = [word[0] for word in wordlist.most_common()]
    return word_features

In [23]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [24]:
features_data = np.array(sentences)
# แบ่งข้อมูลเป็น 10 ชุด โดยไม่เรียง
k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
for train_set, test_set in k_fold.split(features_data):
    word_features = get_word_features(get_words_in_sentences(features_data[train_set].tolist()))
    train_features = apply_features(extract_features, features_data[train_set].tolist())
    test_features = apply_features(extract_features, features_data[test_set].tolist())
    classifier = NaiveBayesClassifier.train(train_features)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_features):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    accuracy_score = util.accuracy(classifier, test_features)
    print('train: {} test: {}'.format(len(train_set), len(test_set)))
    print('=================== Results ===================')
    print('Accuracy {:f}'.format(accuracy_score))
    print('            Positive     Negative    Neutral')
    print('F1         [{:f}     {:f}]'.format(
        f_measure(refsets['pos'], testsets['pos']),
        f_measure(refsets['neg'], testsets['neg'])
    ))
    print('Precision  [{:f}     {:f}]'.format(
        precision(refsets['pos'], testsets['pos']),
        precision(refsets['neg'], testsets['neg'])
    ))
    print('Recall     [{:f}     {:f}]'.format(
        recall(refsets['pos'], testsets['pos']),
        recall(refsets['neg'], testsets['neg'])
    ))
    print('===============================================\n')
    f = open('my_classifier.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()

  features_data = np.array(sentences)


train: 1071 test: 120
Accuracy 0.733333
            Positive     Negative    Neutral
F1         [0.729412     0.794326]
Precision  [0.885714     0.666667]
Recall     [0.620000     0.982456]

train: 1072 test: 119
Accuracy 0.655462
            Positive     Negative    Neutral
F1         [0.640000     0.729730]
Precision  [0.923077     0.580645]
Recall     [0.489796     0.981818]

train: 1072 test: 119
Accuracy 0.638655
            Positive     Negative    Neutral
F1         [0.583333     0.723684]
Precision  [0.875000     0.578947]
Recall     [0.437500     0.964912]

train: 1072 test: 119
Accuracy 0.705882
            Positive     Negative    Neutral
F1         [0.674419     0.769231]
Precision  [0.828571     0.662651]
Recall     [0.568627     0.916667]

train: 1072 test: 119
Accuracy 0.630252
            Positive     Negative    Neutral
F1         [0.548387     0.725000]
Precision  [0.850000     0.585859]
Recall     [0.404762     0.950820]

train: 1072 test: 119
Accuracy 0.764706
     

# HOW IT WORK?

In [27]:
pos_tweets=[('I love this car','positive'), 
    ('This view is amazing','positive'),
    ('I feel great this morning','positive'),
    ('I am so excited about the concert','positive'),
    ('He is my best friend','positive')]

In [28]:
neg_tweets=[('I do not like this car','negative'),
    ('This view is horrible','negative'),
    ('I feel tired this morning','negative'),
    ('I am not looking forward to the concert','negative'),
    ('He is my enemy','negative')]

In [29]:
tweets=[]
for(words,sentiment)in pos_tweets+neg_tweets:
    words_filtered=[e.lower() for e in words.split() if len(e)>=3]
    tweets.append((words_filtered,sentiment))

In [30]:
tweets

[(['love', 'this', 'car'], 'positive'),
 (['this', 'view', 'amazing'], 'positive'),
 (['feel', 'great', 'this', 'morning'], 'positive'),
 (['excited', 'about', 'the', 'concert'], 'positive'),
 (['best', 'friend'], 'positive'),
 (['not', 'like', 'this', 'car'], 'negative'),
 (['this', 'view', 'horrible'], 'negative'),
 (['feel', 'tired', 'this', 'morning'], 'negative'),
 (['not', 'looking', 'forward', 'the', 'concert'], 'negative'),
 (['enemy'], 'negative')]

In [31]:
word_features = get_word_features(get_words_in_sentences(tweets))

In [32]:
word_features

['this',
 'car',
 'view',
 'feel',
 'morning',
 'the',
 'concert',
 'not',
 'love',
 'amazing',
 'great',
 'excited',
 'about',
 'best',
 'friend',
 'like',
 'horrible',
 'tired',
 'looking',
 'forward',
 'enemy']

In [33]:
training_set = apply_features(extract_features, tweets)

In [34]:
training_set

[({'contains(this)': True, 'contains(car)': True, 'contains(view)': False, 'contains(feel)': False, 'contains(morning)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(not)': False, 'contains(love)': True, 'contains(amazing)': False, 'contains(great)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(like)': False, 'contains(horrible)': False, 'contains(tired)': False, 'contains(looking)': False, 'contains(forward)': False, 'contains(enemy)': False}, 'positive'), ({'contains(this)': True, 'contains(car)': False, 'contains(view)': True, 'contains(feel)': False, 'contains(morning)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(not)': False, 'contains(love)': False, 'contains(amazing)': True, 'contains(great)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(like)': False, 'contains(horrible)': Fa