# Sentiment Analysis

### https://medium.com/@decimo.me/ควันว่างกับ-sentiment-analysis-a79b46ef9e77

In [1]:
# ใช้ตัดคำภาษาไทย
import deepcut
# ใช้งาน regex
import re
# จัดการเกี่ยวกับ array
import numpy as np
import pandas as pd
# สำหรับทำ classify และทดสอบโมเดล
from nltk import FreqDist, precision, recall, f_measure, NaiveBayesClassifier
from nltk.classify import apply_features
from nltk.classify import util
# สำหรับสร้างชุดข้อมูลสำหรับ train และ test เพื่อทดสอบประสิทธิภาพ
from sklearn.model_selection import KFold
import collections, itertools

In [25]:
data_pos = [(line.strip(), 'pos') for line in open("pos.txt", 'r')]
data_neg = [(line.strip(), 'neg') for line in open("neg.txt", 'r')]

In [26]:
def split_words (sentence):
    return deepcut.tokenize(''.join(sentence.lower().split()))
sentences = [(split_words(sentence), sentiment) for (sentence, sentiment) in data_pos + data_neg]

In [27]:
sentences

[(['ดี'], 'pos'),
 ([''], 'pos'),
 (['ขอบคุณ'], 'pos'),
 ([''], 'pos'),
 (['สู้', 'เสมอ'], 'pos'),
 ([''], 'pos'),
 (['ผ่าน'], 'pos'),
 ([''], 'pos'),
 (['โคตรสนุก'], 'pos'),
 ([''], 'pos'),
 (['สุดยอด'], 'pos'),
 ([''], 'pos'),
 (['น่ารัก'], 'pos'),
 ([''], 'pos'),
 (['ขอบคุณ', 'ทุก'], 'pos'),
 ([''], 'pos'),
 (['คง', 'คิด', 'ถึง'], 'pos'),
 ([''], 'pos'),
 (['เก็บ', 'เวลา', 'ที่', 'ดี', 'ให้', 'เรา', 'ได้', 'จดจำ'], 'pos'),
 ([''], 'pos'),
 (['ทำ', 'ข้อ', 'สอบ', 'ให้', 'ได้'], 'pos'),
 ([''], 'pos'),
 (['สู้'], 'pos'),
 ([''], 'pos'),
 (['รอย', 'ยิ้ม', 'ใน', 'ความทรงจำ'], 'pos'),
 ([''], 'pos'),
 (['ทุก', 'นี้', 'ฉัน', 'คิด', 'ถึง'], 'pos'),
 ([''], 'pos'),
 (['ความทรงจำ', 'ที่', 'ดี', 'เหลือเกิน'], 'pos'),
 ([''], 'pos'),
 (['รัก', 'มาก', 'คิด', 'ถึง', 'มาก'], 'pos'),
 ([''], 'pos'),
 (['ดีใจ'], 'pos'),
 ([''], 'pos'),
 (['รัก'], 'pos'),
 ([''], 'pos'),
 (['ห่วง'], 'pos'),
 ([''], 'pos'),
 (['รัก', 'แฟน', 'ขี้', 'บ่น', 'จัง'], 'pos'),
 ([''], 'pos'),
 (['ขอบคุณ', 'ผู้ใจ', 'ดี'], 'po

In [31]:
def get_words_in_sentences(sentences):
    all_words = []
    for (words, sentiment) in sentences:
        all_words.extend(words)
    return all_words

In [28]:
def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = [word[0] for word in wordlist.most_common()]
    return word_features

In [29]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [32]:
features_data = np.array(sentences)
# แบ่งข้อมูลเป็น 10 ชุด โดยไม่เรียง
k_fold = KFold(n_splits=10, random_state=1992, shuffle=True)
word_features = None
accuracy_scores = []
for train_set, test_set in k_fold.split(features_data):
    word_features = get_word_features(get_words_in_sentences(features_data[train_set].tolist()))
    train_features = apply_features(extract_features, features_data[train_set].tolist())
    test_features = apply_features(extract_features, features_data[test_set].tolist())
    classifier = NaiveBayesClassifier.train(train_features)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(test_features):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
    accuracy_score = util.accuracy(classifier, test_features)
    print('train: {} test: {}'.format(len(train_set), len(test_set)))
    print('=================== Results ===================')
    print('Accuracy {:f}'.format(accuracy_score))
    print('            Positive     Negative')
    print('F1         [{:f}     {:f}]'.format(
        f_measure(refsets['pos'], testsets['pos']),
        f_measure(refsets['neg'], testsets['neg'])
    ))
    print('Precision  [{:f}     {:f}]'.format(
        precision(refsets['pos'], testsets['pos']),
        precision(refsets['neg'], testsets['neg'])
    ))
    print('Recall     [{:f}     {:f}]'.format(
        recall(refsets['pos'], testsets['pos']),
        recall(refsets['neg'], testsets['neg'])
    ))
    print('===============================================\n')

train: 1911 test: 213
Accuracy 0.704225
            Positive     Negative
F1         [0.540146     0.782007]
Precision  [0.804348     0.676647]
Recall     [0.406593     0.926230]

train: 1911 test: 213
Accuracy 0.690141
            Positive     Negative
F1         [0.500000     0.775510]
Precision  [0.891892     0.647727]
Recall     [0.347368     0.966102]

train: 1911 test: 213
Accuracy 0.694836
            Positive     Negative
F1         [0.480000     0.784053]
Precision  [0.833333     0.666667]
Recall     [0.337079     0.951613]

train: 1911 test: 213
Accuracy 0.652582
            Positive     Negative
F1         [0.439394     0.748299]
Precision  [0.805556     0.621469]
Recall     [0.302083     0.940171]

train: 1912 test: 212
Accuracy 0.655660
            Positive     Negative
F1         [0.474820     0.743860]
Precision  [0.891892     0.605714]
Recall     [0.323529     0.963636]

train: 1912 test: 212
Accuracy 0.712264
            Positive     Negative
F1         [0.504065     0