In [2]:
# sys module
import os
import sys
import nltk
from nltk.corpus import stopwords
from nltk.classify import naivebayes
import string

# third-party modules
import pandas as pd

# local module
from data import load_data_sentiment, load_lexicon_sentiment

In [3]:
# Load the labelled comments. The cells below use `data` as a pandas DataFrame
# and read its 'comment' and 'sentiment' columns.
data = load_data_sentiment ()

In [4]:
# preprocessing
# remove stop words and punctuation
stopword = stopwords.words('english')

# Built once at cell level so every call does O(1) membership tests instead of
# rebuilding a set from the stop-word list for each comment.
_STOPWORD_SET = frozenset(stopword)

# Translation table turning every ASCII punctuation character into a space,
# applied in a single pass over the text.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation,
                                      ' ' * len(string.punctuation))


def preprocessing(comment, min_length=4):
    """Turn a raw comment into a set of normalised, informative tokens.

    The comment has its punctuation replaced by spaces, is lower-cased and
    tokenised with ``nltk.word_tokenize``. Tokens shorter than ``min_length``
    characters and English stop words are then discarded.

    Parameters
    ----------
    comment : str
        Raw text of one comment.
    min_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 4 reproduces the original ``len(w) > 3`` rule.

    Returns
    -------
    set of str
        Unique surviving tokens; order and multiplicity are not preserved.
    """
    text = comment.translate(_PUNCTUATION_TO_SPACE).lower()
    return {
        token
        for token in nltk.word_tokenize(text)
        if len(token) >= min_length and token not in _STOPWORD_SET
    }


data['word'] = data['comment'].apply(preprocessing)


In [5]:
# feature extraction
def extract_feature(word):
    """Build a presence featureset from an iterable of tokens.

    Every distinct token in ``word`` becomes a key mapped to ``True``, which
    is the dict-of-features shape passed to the classifier in later cells.
    """
    return dict.fromkeys(word, True)
# One featureset dict per row, derived element-wise from the token sets.
data['feature'] = data['word'].map(extract_feature)
# Preview the featureset of the row labelled 0.
data['feature'].loc[0]

{'distressed': True,
 'slow': True,
 'drifting': True,
 'movie': True,
 'moving': True,
 'aimless': True,
 'young': True}

In [6]:
# Train a Naive Bayes classifier on (featureset, label) pairs, one per row.
classifier = naivebayes.NaiveBayesClassifier.train(
    list(zip(data['feature'], data['sentiment']))
)

In [7]:
# Print the 50 features whose presence best separates the two labels.
classifier.show_most_informative_features(n=50)

Most Informative Features
               wonderful = True                1 : 0      =     10.9 : 1.0
                   waste = True                0 : 1      =      8.9 : 1.0
                  stupid = True                0 : 1      =      8.9 : 1.0
                 nothing = True                0 : 1      =      7.5 : 1.0
                  played = True                1 : 0      =      5.9 : 1.0
                   loved = True                1 : 0      =      5.3 : 1.0
                   liked = True                1 : 0      =      5.3 : 1.0
                   would = True                0 : 1      =      5.0 : 1.0
                   white = True                1 : 0      =      4.7 : 1.0
                   makes = True                1 : 0      =      4.7 : 1.0
                   lines = True                0 : 1      =      4.6 : 1.0
                    girl = True                0 : 1      =      4.6 : 1.0
                  action = True                0 : 1      =      4.6 : 1.0

In [8]:
# Classify every row's featureset and store the predicted label.
data['prediction'] = data['feature'].map(classifier.classify)

In [10]:
# Compare the true labels with the classifier's predictions.
print(data[['sentiment', 'prediction']].head(5))

# 1 where the prediction matches the label, 0 otherwise.
data['is_true'] = (data['sentiment'] == data['prediction']).astype(int)

# The mean of a 0/1 column is the fraction of correct predictions. Series.mean
# is vectorised (the builtin sum() iterated element by element) and gives NaN
# instead of raising ZeroDivisionError when the frame is empty.
accuracy = data['is_true'].mean()

# NOTE(review): the classifier was trained on this same `data` frame, so this
# is in-sample accuracy; a held-out split is needed to estimate generalisation.
print("Accuracy : {:.2F}".format(accuracy))

   sentiment  prediction
0          0           0
1          0           0
2          0           0
3          0           0
4          1           1
Accuracy : 0.96
